From bc7bd215cd09f8a82da9522f8036ee7e35cdaebd Mon Sep 17 00:00:00 2001 From: yu-hsieh Date: Tue, 22 Aug 2023 16:01:28 +0000 Subject: [PATCH] add navi31 F32 logic yaml --- .../navi31/navi31_Cijk_Ailk_Bjlk_SB.yaml | 64724 ++++++++++- .../navi31/navi31_Cijk_Ailk_Bjlk_SB_GB.yaml | 64892 +++++++++++ .../navi31/navi31_Cijk_Ailk_Bljk_SB.yaml | 96200 +++++++++++++++- .../navi31/navi31_Cijk_Ailk_Bljk_SB_GB.yaml | 58188 ++++++++++ .../navi31/navi31_Cijk_Alik_Bjlk_SB.yaml | 31752 ++++- .../navi31/navi31_Cijk_Alik_Bjlk_SB_GB.yaml | 31912 +++++ .../navi31/navi31_Cijk_Alik_Bljk_SB.yaml | 80568 ++++++++++++- .../navi31/navi31_Cijk_Alik_Bljk_SB_GB.yaml | 66118 +++++++++++ 8 files changed, 494142 insertions(+), 212 deletions(-) create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB_GB.yaml diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB.yaml index 978c26773..101dbc6e0 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB.yaml @@ -1,7 +1,7 @@ -- {MinimumRequiredVersion: 4.33.0} +- {MinimumRequiredVersion: 4.26.0} - navi31 - gfx1100 -- [Device 6863] +- [Device 744c] - AllowNoFreeDims: false AssignedDerivedParameters: true Batched: true @@ -11,7 +11,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -55,16 +54,55288 @@ ZeroPadB: [] - - 1LDSBuffer: 0 AggressivePerfMode: 1 - AssertAlphaValue: false - AssertBetaValue: false - AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x16_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 4 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 0 AssertSizeEqual: {} - AssertSizeGreaterThan: {} - AssertSizeLessThan: {} - AssertSizeMultiple: {} AssertStrideAEqual: {0: 1} AssertStrideBEqual: {0: 1} AssertStrideCEqual: {0: 1} @@ -72,27 +55343,22 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - AtomicAddC: false BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false CodeObjectVersion: default - CustomKernelName: '' - DepthU: 8 + DepthU: 32 DepthULdsDivisor: 1 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false DisableAtomicFail: 0 DisableKernelPieces: 0 DisableVgprOverlapping: false EdgeType: ShiftPtr EnableMatrixInstruction: false - ExpandPointerSwap: 0 - Fp16AltImpl: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -102,36 +55368,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadPerMfma: 1 GlobalReadVectorWidth: 1 GlobalSplitU: 1 - GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [11, 0, 0] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 LSPA: 8 - LSPB: 8 - LVCA: 32 + LSPB: 2 + LVCA: 8 LVCB: 32 LVPA: 8 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false LdsBlockSizePerPad: 0 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 - LdsInitCVgprs: false - LdsNumElements: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -141,18 +55408,16 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false - LoopIters: 8 + LoopIters: 32 LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MACInstruction: FMA - MIArchVgpr: false - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -161,23 +55426,18 @@ MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 - NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 OptNoLoadLoop: 1 OptPreLoopVmcnt: 0 PackBatchDims: 0 @@ -194,8 +55454,7 @@ PersistentKernel: 0 PersistentKernelAlongBatch: false PrefetchAcrossPersistent: 0 - PrefetchAcrossPersistentMode: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AllowNoFreeDims: false @@ -207,7 +55466,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -253,30 +55511,245 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_ - SourceSwap: false - StaggerU: 32 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StoreCInUnroll: false - StoreCInUnrollExact: false - StoreCInUnrollInterval: 1 - StoreCInUnrollPostLoop: false - StorePriorityOpt: false + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SN_SU32_SUM3_TT1_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 StoreVectorWidth: 4 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 TransposeLDS: 0 UnrollIncIsDepthU: 0 UnrollMajorLDSA: 0 @@ -291,20 +55764,9129 @@ VectorWidth: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WavefrontSize: 32 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _DepthULds: 8 + _DepthULds: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 3 - allowLRVWforTLUandMI: false + _staggerStrideShift: 0 - [2, 3, 0, 1] -- - - [126, 126, 2, 66, 126, 126, 126, 126] - - [0, 0] -- null +- - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [17, 18267.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [5, 17183.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [33, 18648.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [17, 18594.0] + - - [3072, 768, 1, 4096, 3072, 3072, 3072, 768] + - [38, 19943.0] + - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] + - [33, 18813.0] + - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] + - [17, 18839.0] + - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] + - [33, 18466.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [21, 20100.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [18, 19114.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [33, 19146.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [3, 19493.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [9, 17238.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [37, 20476.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [3, 18253.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 5056] + - [33, 18166.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [37, 20233.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [34, 19309.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [20, 18702.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [21, 19356.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [17, 16814.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [20, 18788.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [37, 20128.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [9, 15038.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 1408] + - [35, 15149.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [38, 19306.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [37, 20352.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [0, 16082.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [34, 18858.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [37, 20134.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [5, 17229.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [33, 17843.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [38, 15534.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [3, 19113.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [33, 18773.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [3, 16422.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [33, 17937.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [21, 20715.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [18, 19582.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 6784] + - [33, 18839.0] + - - [704, 5056, 1, 128, 704, 704, 704, 5056] + - [47, 15486.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [3, 18685.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [37, 20786.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [17, 19155.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [21, 20049.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [17, 17574.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [37, 20258.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [37, 19800.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [33, 18143.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 2944] + - [17, 17442.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [1, 19574.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [20, 18355.0] + - - [448, 5888, 1, 128, 448, 448, 448, 5888] + - [33, 14259.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [17, 19271.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [17, 14796.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [1, 20479.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 2944] + - [17, 19280.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [0, 16792.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [36, 18790.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 6784] + - [17, 18365.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [33, 18127.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [33, 17984.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [0, 18929.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [33, 19183.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [15, 16951.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [37, 20838.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [20, 18278.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [33, 14689.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [17, 15416.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [5, 18665.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [20, 19047.0] + - - [704, 2944, 1, 128, 704, 704, 704, 2944] + - [0, 14545.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [33, 18072.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [5, 17377.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [33, 18281.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 1408] + - [0, 18334.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [37, 20147.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [36, 19427.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [1, 19827.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 4288] + - [9, 18130.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [18, 19707.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [37, 20184.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [37, 20613.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [34, 20209.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [37, 19816.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [38, 17542.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [37, 20585.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [33, 19227.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [4, 20162.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 5888] + - [38, 19446.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [34, 17693.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 3584] + - [38, 18344.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [37, 20950.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [38, 19537.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [15, 17674.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 2368] + - [17, 18699.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [33, 17437.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [6, 17821.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [4, 20159.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [1, 19855.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [20, 17576.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [13, 19146.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [17, 17633.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [0, 18208.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [21, 19710.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [17, 18826.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 448] + - [0, 16063.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [34, 18328.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [4, 20880.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [1, 19591.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [17, 15847.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [33, 18006.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [21, 20330.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 5888] + - [5, 19285.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 3584] + - [38, 17923.0] + - - [448, 3584, 1, 128, 448, 448, 448, 3584] + - [24, 12686.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [4, 20393.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 5888] + - [33, 18680.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [1, 20253.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 704] + - [33, 14360.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [37, 20546.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 2368] + - [33, 18959.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 704] + - [9, 16665.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [36, 19134.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [37, 20170.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [4, 20833.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [37, 20248.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [9, 16740.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 5888] + - [33, 18323.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [21, 20585.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [4, 20320.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [33, 19183.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [3, 19054.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [18, 18932.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [22, 19299.0] + - - [256, 5056, 1, 128, 256, 256, 256, 5056] + - [33, 14088.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [5, 18945.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [37, 19368.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [44, 16128.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 1408] + - [9, 18124.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [25, 19649.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [0, 16394.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [17, 19154.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 2368] + - [33, 19289.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [10, 19534.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [17, 17394.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [37, 20276.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [33, 17438.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 2944] + - [38, 19695.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [33, 19123.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 1856] + - [33, 18887.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 1024] + - [0, 15628.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 3584] + - [0, 19056.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [37, 20940.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [5, 19272.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [33, 19088.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [4, 20391.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [17, 15807.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [33, 17118.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [33, 15100.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [37, 19748.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 704] + - [33, 15720.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [10, 19624.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 5888] + - [1, 20003.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [38, 17591.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [37, 19411.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [10, 19457.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [20, 19040.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 4288] + - [9, 19165.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 1856] + - [0, 16997.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [21, 19430.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [37, 20378.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 2368] + - [33, 18295.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 4288] + - [0, 17791.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [0, 18215.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [33, 19333.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 6784] + - [1, 19927.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [1, 19829.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [33, 16157.0] + - - [448, 4288, 1, 128, 448, 448, 448, 4288] + - [33, 13661.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [4, 20216.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [19, 16931.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 4288] + - [17, 18391.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [38, 17504.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [37, 19927.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [17, 19565.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [36, 19428.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [5, 18679.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [1, 20133.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 2368] + - [17, 18712.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [21, 20199.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [25, 20146.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 3584] + - [15, 17534.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [17, 17547.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 6784] + - [34, 19702.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [17, 18827.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [0, 16697.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [1, 19647.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [4, 20525.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 5888] + - [1, 19832.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 5888] + - [18, 19353.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [20, 19304.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [33, 19492.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [18, 19276.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [21, 20267.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [38, 17393.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 704] + - [33, 17823.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [33, 18936.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [5, 17319.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 2368] + - [0, 17235.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [4, 20778.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [17, 18087.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 6784] + - [43, 19735.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [33, 17242.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [33, 15508.0] + - - [256, 5888, 1, 128, 256, 256, 256, 5888] + - [0, 14356.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [4, 20180.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [33, 19373.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [3, 18734.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 4288] + - [33, 19496.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [33, 18435.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 704] + - [33, 16478.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 5056] + - [0, 19334.0] + - - [448, 5056, 1, 128, 448, 448, 448, 5056] + - [0, 13715.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 5056] + - [33, 18423.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 3584] + - [38, 19433.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [0, 19295.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [4, 20311.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 5056] + - [17, 18491.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [14, 20567.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [4, 20422.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [21, 20223.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [33, 17904.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [33, 16303.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [4, 20449.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [37, 20856.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [4, 20277.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 3584] + - [38, 19593.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 1856] + - [9, 16234.0] + - - [704, 3584, 1, 128, 704, 704, 704, 3584] + - [17, 14614.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [17, 17507.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [34, 19668.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 2944] + - [5, 18992.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [4, 20132.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [21, 20251.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [38, 19757.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [33, 18389.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 3584] + - [38, 19449.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [17, 17486.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [21, 19416.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [38, 16905.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [37, 20705.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [5, 18570.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [33, 17769.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 4288] + - [33, 19550.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [33, 18648.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [20, 19200.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [38, 18099.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [1, 19378.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 3584] + - [33, 17322.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 1408] + - [0, 15177.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 2944] + - [17, 17918.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 2944] + - [5, 18999.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [21, 20409.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 2368] + - [33, 16842.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 2368] + - [33, 19153.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [37, 20463.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [21, 20118.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [9, 18503.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [37, 19766.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [3, 19091.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 5056] + - [17, 19329.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [34, 18974.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [37, 20121.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [5, 15879.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [4, 20248.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [20, 19107.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 448] + - [33, 14914.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [17, 18604.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [33, 18753.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [17, 18816.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [4, 20793.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [37, 20104.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [38, 18651.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [1, 20100.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [38, 19381.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 1408] + - [17, 17408.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [37, 20699.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [24, 19312.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [37, 19313.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [1, 19706.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [9, 18585.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [33, 18075.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [1, 20388.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [5, 18607.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [22, 17257.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [33, 18316.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [33, 19224.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 2944] + - [22, 19009.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [37, 20673.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [20, 19724.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [1, 19491.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [21, 20370.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [47, 16743.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [37, 19234.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [38, 18487.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [38, 17078.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [33, 18216.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [33, 19373.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [17, 17476.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 2368] + - [33, 16015.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [37, 19819.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [33, 18019.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [4, 20099.0] + - - [448, 6784, 1, 128, 448, 448, 448, 6784] + - [17, 14647.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [4, 20304.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [0, 18654.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [34, 19095.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [21, 19288.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 448] + - [17, 14813.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [17, 15989.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [38, 17718.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [17, 19458.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [0, 17085.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [3, 19514.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [33, 17522.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [33, 19483.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [33, 18750.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [17, 17775.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [0, 17381.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 2368] + - [0, 17679.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 1408] + - [33, 18874.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [1, 19863.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [17, 18761.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 3584] + - [17, 19076.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [46, 19979.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 1024] + - [33, 14498.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [5, 15987.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [37, 19649.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [37, 19614.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [22, 16607.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [1, 18875.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [17, 16336.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [38, 15785.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [5, 17516.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 1024] + - [22, 18879.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [0, 18114.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 5056] + - [33, 19280.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [34, 19221.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [4, 19745.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 6784] + - [34, 19263.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [37, 20758.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 1856] + - [9, 18062.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 2944] + - [0, 17372.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 448] + - [17, 16899.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [36, 16254.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 1856] + - [9, 16965.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [9, 16918.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [33, 17943.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 1024] + - [33, 17463.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [22, 16835.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [20, 19257.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [18, 19805.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [22, 15276.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [21, 20059.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 1024] + - [33, 18340.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [37, 20472.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [1, 20011.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [33, 18378.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [22, 18904.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 1856] + - [33, 18271.0] + - - [256, 6784, 1, 128, 256, 256, 256, 6784] + - [33, 16179.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 6784] + - [38, 19579.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 5056] + - [0, 19080.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 5888] + - [0, 18276.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [1, 19953.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [0, 18967.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [1, 20032.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [38, 17156.0] + - - [704, 5888, 1, 128, 704, 704, 704, 5888] + - [38, 16088.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 3584] + - [1, 19593.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [37, 20217.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 1408] + - [33, 15454.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [24, 19004.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [0, 18068.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [17, 13943.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [33, 17404.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [17, 18456.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [33, 18260.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [4, 20205.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [34, 20102.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [3, 18911.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [37, 19677.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [5, 16137.0] + - - [704, 4288, 1, 128, 704, 704, 704, 4288] + - [22, 14907.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 4288] + - [0, 18389.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 6784] + - [22, 18720.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [33, 18287.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [47, 16207.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [22, 17745.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 5056] + - [24, 19543.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [17, 14360.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [3, 19726.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [17, 17392.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [20, 18494.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 256] + - [9, 12764.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 5888] + - [33, 18630.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [33, 16792.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 1856] + - [17, 15655.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [37, 20648.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [38, 17367.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [4, 20070.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [13, 18712.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [1, 18906.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [1, 19973.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [37, 18703.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [37, 19634.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [33, 17919.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [0, 19209.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 1024] + - [33, 16777.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [21, 20123.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [37, 20842.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 5056] + - [33, 19534.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [21, 19388.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [33, 15241.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [17, 14942.0] + - - [704, 2368, 1, 128, 704, 704, 704, 2368] + - [17, 13523.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 256] + - [33, 14166.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 1856] + - [33, 18566.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 4288] + - [0, 19011.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [33, 17631.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [17, 18736.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [5, 18899.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [33, 16740.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 6784] + - [1, 19927.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [37, 20593.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [33, 17927.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [37, 20216.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [33, 19062.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 5888] + - [38, 19713.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [1, 19910.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [0, 16293.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [0, 18731.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [1, 19269.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [1, 19388.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [4, 19252.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [33, 18246.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [37, 20698.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [37, 19272.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [33, 17405.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [22, 17104.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [38, 16507.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [0, 18698.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [33, 19129.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [9, 14558.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [21, 19818.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [38, 17274.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [37, 20485.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [1, 19141.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [33, 18475.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [9, 18640.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 1408] + - [33, 18584.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [37, 19688.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [1, 19552.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [9, 18630.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [22, 17979.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 1024] + - [5, 18256.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [1, 20031.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 256] + - [38, 16016.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 704] + - [17, 17419.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [0, 19385.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [20, 19248.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [5, 18842.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [34, 19772.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [33, 17662.0] + - - [704, 6784, 1, 128, 704, 704, 704, 6784] + - [33, 16549.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [1, 19811.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [37, 20563.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [34, 19807.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 6784] + - [38, 19015.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [5, 15919.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [43, 19673.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [33, 18089.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 5056] + - [9, 18019.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [21, 20832.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [17, 15694.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 448] + - [9, 15878.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 2944] + - [38, 19405.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [5, 19094.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [21, 19720.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [22, 18607.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 4288] + - [42, 19158.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [14, 20358.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [17, 15644.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 1408] + - [22, 17952.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [18, 19237.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [33, 17986.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [37, 20764.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [22, 18722.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [36, 17627.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [45, 19796.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 2944] + - [0, 16309.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [33, 18587.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [21, 20371.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [9, 18068.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [34, 20014.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [22, 18858.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [5, 18530.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 1856] + - [33, 19228.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 704] + - [33, 15496.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [47, 16641.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [46, 20304.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [37, 20795.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 1408] + - [2, 16117.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 1024] + - [33, 15431.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [37, 20545.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [17, 19161.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [38, 18682.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 4288] + - [33, 19236.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [18, 19528.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [5, 17685.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [33, 16446.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 1856] + - [17, 18473.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [0, 18889.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [4, 20097.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [17, 17378.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [37, 20904.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [5, 16919.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [33, 18224.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 1024] + - [0, 17062.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [4, 15443.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [0, 16756.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [22, 16839.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 10082.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [33, 17074.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [3, 14704.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 11077.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [17, 15910.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [17, 15030.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [17, 15136.0] + - - [5329, 160, 64, 64, 5329, 5329, 5329, 160] + - [0, 7550.0] + - - [1225, 384, 64, 192, 1225, 1225, 1225, 384] + - [17, 18282.0] + - - [289, 1024, 64, 256, 289, 289, 289, 1024] + - [22, 14868.0] + - - [1225, 384, 64, 64, 1225, 1225, 1225, 384] + - [0, 15328.0] + - - [1225, 384, 64, 96, 1225, 1225, 1225, 384] + - [0, 14647.0] + - - [289, 1024, 64, 384, 289, 289, 289, 1024] + - [1, 15079.0] + - - [289, 1024, 64, 192, 289, 289, 289, 1024] + - [5, 14752.0] + - - [289, 1024, 64, 128, 289, 289, 289, 1024] + - [17, 14493.0] + - - [4096, 1024, 1, 2984, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [1024, 4096, 1, 3437, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [1024, 4096, 1, 3235, 1024, 1024, 1024, 4096] + - [0, 18593.0] + - - [4096, 1024, 1, 4032, 4096, 4096, 4096, 1024] + - [33, 18603.0] + - - [1024, 4096, 1, 3334, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3288, 4096, 4096, 4096, 1024] + - [17, 18648.0] + - - [1024, 4096, 1, 3515, 1024, 1024, 1024, 4096] + - [17, 18586.0] + - - [4096, 1024, 1, 3437, 4096, 4096, 4096, 1024] + - [42, 18614.0] + - - [1024, 4096, 1, 3259, 1024, 1024, 1024, 4096] + - [33, 18594.0] + - - [1024, 4096, 1, 3384, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3458, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [1024, 4096, 1, 3412, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3529, 1024, 1024, 1024, 4096] + - [9, 18595.0] + - - [1024, 4096, 1, 4032, 1024, 1024, 1024, 4096] + - [33, 18614.0] + - - [4096, 1024, 1, 3999, 4096, 4096, 4096, 1024] + - [17, 18604.0] + - - [1024, 4096, 1, 3079, 1024, 1024, 1024, 4096] + - [17, 18581.0] + - - [1024, 4096, 1, 3876, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 3450, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3256, 1024, 1024, 1024, 4096] + - [17, 18608.0] + - - [4096, 1024, 1, 3403, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 4096, 1, 3359, 1024, 1024, 1024, 4096] + - [24, 18604.0] + - - [4096, 1024, 1, 3549, 4096, 4096, 4096, 1024] + - [33, 18598.0] + - - [4096, 1024, 1, 3176, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [1024, 4096, 1, 3504, 1024, 1024, 1024, 4096] + - [42, 18608.0] + - - [4096, 1024, 1, 3314, 4096, 4096, 4096, 1024] + - [33, 18607.0] + - - [4096, 1024, 1, 3183, 4096, 4096, 4096, 1024] + - [17, 18575.0] + - - [1024, 4096, 1, 3209, 1024, 1024, 1024, 4096] + - [0, 18615.0] + - - [1024, 4096, 1, 3720, 1024, 1024, 1024, 4096] + - [0, 18637.0] + - - [1024, 4096, 1, 3859, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [1024, 33708, 1, 4059, 1024, 1024, 1024, 33708] + - [4, 20849.0] + - - [4096, 1024, 1, 3477, 4096, 4096, 4096, 1024] + - [24, 18588.0] + - - [4096, 1024, 1, 3233, 4096, 4096, 4096, 1024] + - [17, 18590.0] + - - [4096, 1024, 1, 3409, 4096, 4096, 4096, 1024] + - [33, 18591.0] + - - [4096, 1024, 1, 3564, 4096, 4096, 4096, 1024] + - [0, 18606.0] + - - [4096, 1024, 1, 3190, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3288, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3451, 4096, 4096, 4096, 1024] + - [0, 18590.0] + - - [1024, 4096, 1, 3348, 1024, 1024, 1024, 4096] + - [9, 18597.0] + - - [1024, 4096, 1, 3465, 1024, 1024, 1024, 4096] + - [0, 18605.0] + - - [1024, 33708, 1, 4032, 1024, 1024, 1024, 33708] + - [21, 20870.0] + - - [1024, 33708, 1, 3840, 1024, 1024, 1024, 33708] + - [4, 20855.0] + - - [4096, 1024, 1, 3391, 4096, 4096, 4096, 1024] + - [33, 18593.0] + - - [1024, 4096, 1, 3530, 1024, 1024, 1024, 4096] + - [33, 18585.0] + - - [4096, 1024, 1, 3209, 4096, 4096, 4096, 1024] + - [33, 18603.0] + - - [1024, 4096, 1, 3457, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3386, 1024, 1024, 1024, 4096] + - [0, 18595.0] + - - [4096, 1024, 1, 3350, 4096, 4096, 4096, 1024] + - [17, 18587.0] + - - [1024, 4096, 1, 3184, 1024, 1024, 1024, 4096] + - [17, 18595.0] + - - [1024, 4096, 1, 3093, 1024, 1024, 1024, 4096] + - [0, 18621.0] + - - [1024, 4096, 1, 3400, 1024, 1024, 1024, 4096] + - [33, 18607.0] + - - [1024, 4096, 1, 3214, 1024, 1024, 1024, 4096] + - [0, 18592.0] + - - [4096, 1024, 1, 3406, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3565, 1024, 1024, 1024, 4096] + - [17, 18600.0] + - - [4096, 1024, 1, 3536, 4096, 4096, 4096, 1024] + - [33, 18597.0] + - - [1024, 4096, 1, 3183, 1024, 1024, 1024, 4096] + - [17, 18596.0] + - - [1024, 4096, 1, 3462, 1024, 1024, 1024, 4096] + - [33, 18598.0] + - - [4096, 1024, 1, 3130, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3381, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3298, 4096, 4096, 4096, 1024] + - [42, 18624.0] + - - [1024, 4096, 1, 3292, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [4096, 1024, 1, 3289, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3379, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3990, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3540, 1024, 1024, 1024, 4096] + - [33, 18606.0] + - - [4096, 1024, 1, 3412, 4096, 4096, 4096, 1024] + - [0, 18610.0] + - - [1024, 4096, 1, 3555, 1024, 1024, 1024, 4096] + - [42, 18603.0] + - - [1024, 4096, 1, 3518, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3189, 4096, 4096, 4096, 1024] + - [17, 18591.0] + - - [1024, 4096, 1, 3298, 1024, 1024, 1024, 4096] + - [24, 18601.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [0, 18619.0] + - - [1024, 4096, 1, 3393, 1024, 1024, 1024, 4096] + - [33, 18600.0] + - - [1024, 4096, 1, 3207, 1024, 1024, 1024, 4096] + - [9, 18586.0] + - - [4096, 1024, 1, 3487, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3431, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3378, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [4096, 1024, 1, 3529, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3460, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3336, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3501, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3584, 1024, 1024, 1024, 4096] + - [0, 18639.0] + - - [4096, 1024, 1, 2499, 4096, 4096, 4096, 1024] + - [17, 18605.0] + - - [4096, 1024, 1, 3352, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3543, 1024, 1024, 1024, 4096] + - [17, 18619.0] + - - [1024, 4096, 1, 3476, 1024, 1024, 1024, 4096] + - [0, 18629.0] + - - [1024, 33708, 1, 3822, 1024, 1024, 1024, 33708] + - [4, 20869.0] + - - [1024, 4096, 1, 3436, 1024, 1024, 1024, 4096] + - [33, 18584.0] + - - [1024, 4096, 1, 3594, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3514, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [1024, 4096, 1, 3064, 1024, 1024, 1024, 4096] + - [24, 18608.0] + - - [4096, 1024, 1, 3371, 4096, 4096, 4096, 1024] + - [42, 18613.0] + - - [4096, 1024, 1, 3558, 4096, 4096, 4096, 1024] + - [33, 18642.0] + - - [4096, 1024, 1, 3517, 4096, 4096, 4096, 1024] + - [17, 18601.0] + - - [4096, 1024, 1, 3144, 4096, 4096, 4096, 1024] + - [0, 18616.0] + - - [1024, 4096, 1, 3312, 1024, 1024, 1024, 4096] + - [24, 18606.0] + - - [4096, 1024, 1, 3079, 4096, 4096, 4096, 1024] + - [17, 18619.0] + - - [1024, 4096, 1, 3415, 1024, 1024, 1024, 4096] + - [9, 18609.0] + - - [1024, 4096, 1, 3221, 1024, 1024, 1024, 4096] + - [17, 18613.0] + - - [1024, 4096, 1, 3978, 1024, 1024, 1024, 4096] + - [33, 18599.0] + - - [4096, 1024, 1, 3876, 4096, 4096, 4096, 1024] + - [42, 18623.0] + - - [1024, 4096, 1, 3528, 1024, 1024, 1024, 4096] + - [24, 18609.0] + - - [1024, 4096, 1, 3181, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3445, 4096, 4096, 4096, 1024] + - [0, 18617.0] + - - [4096, 1024, 1, 3450, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [4096, 1024, 1, 3377, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3532, 1024, 1024, 1024, 4096] + - [33, 18622.0] + - - [1024, 33708, 1, 3944, 1024, 1024, 1024, 33708] + - [21, 20878.0] + - - [4096, 1024, 1, 3483, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3358, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3464, 4096, 4096, 4096, 1024] + - [33, 18610.0] + - - [4096, 1024, 1, 3282, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [4096, 1024, 1, 3256, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [1024, 4096, 1, 3057, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 3481, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3340, 4096, 4096, 4096, 1024] + - [42, 18617.0] + - - [1024, 4096, 1, 3273, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [4096, 1024, 1, 3392, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3337, 4096, 4096, 4096, 1024] + - [17, 18615.0] + - - [4096, 1024, 1, 3359, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [4096, 1024, 1, 3498, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3169, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 33708, 1, 3859, 1024, 1024, 1024, 33708] + - [37, 20881.0] + - - [1024, 4096, 1, 3103, 1024, 1024, 1024, 4096] + - [9, 18583.0] + - - [4096, 1024, 1, 3900, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3442, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3248, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3351, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [4096, 1024, 1, 3593, 4096, 4096, 4096, 1024] + - [17, 18643.0] + - - [1024, 4096, 1, 3780, 1024, 1024, 1024, 4096] + - [17, 18648.0] + - - [1024, 33708, 1, 3681, 1024, 1024, 1024, 33708] + - [21, 20865.0] + - - [4096, 1024, 1, 3374, 4096, 4096, 4096, 1024] + - [17, 18607.0] + - - [1024, 4096, 1, 3557, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3906, 4096, 4096, 4096, 1024] + - [17, 18600.0] + - - [4096, 1024, 1, 3504, 4096, 4096, 4096, 1024] + - [33, 18618.0] + - - [1024, 4096, 1, 3270, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [4096, 1024, 1, 3098, 4096, 4096, 4096, 1024] + - [0, 18612.0] + - - [4096, 1024, 1, 3216, 4096, 4096, 4096, 1024] + - [24, 18620.0] + - - [1024, 4096, 1, 3550, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3449, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [1024, 4096, 1, 3403, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3523, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3486, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3564, 1024, 1024, 1024, 4096] + - [0, 18634.0] + - - [1024, 33708, 1, 4005, 1024, 1024, 1024, 33708] + - [37, 20869.0] + - - [4096, 1024, 1, 3296, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3263, 1024, 1024, 1024, 4096] + - [42, 18614.0] + - - [1024, 4096, 1, 3130, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3295, 1024, 1024, 1024, 4096] + - [0, 18642.0] + - - [1024, 33708, 1, 3925, 1024, 1024, 1024, 33708] + - [4, 20879.0] + - - [1024, 4096, 1, 3378, 1024, 1024, 1024, 4096] + - [17, 18621.0] + - - [4096, 1024, 1, 3720, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3399, 4096, 4096, 4096, 1024] + - [42, 18620.0] + - - [4096, 1024, 1, 3543, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [4096, 1024, 1, 3497, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3594, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3144, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 4096, 1, 3975, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3205, 4096, 4096, 4096, 1024] + - [33, 18616.0] + - - [1024, 33708, 1, 3995, 1024, 1024, 1024, 33708] + - [4, 20864.0] + - - [1024, 4096, 1, 3392, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [1024, 4096, 1, 3055, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 4026, 1024, 1024, 1024, 4096] + - [0, 18614.0] + - - [4096, 1024, 1, 3557, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [4096, 1024, 1, 3515, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3486, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3457, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [1024, 4096, 1, 3511, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [4096, 1024, 1, 3138, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [1024, 4096, 1, 3339, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3939, 1024, 1024, 1024, 4096] + - [17, 18639.0] + - - [4096, 1024, 1, 3500, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [4096, 1024, 1, 3395, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 4020, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3942, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3349, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 4096, 1, 3322, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [4096, 1024, 1, 3452, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 4096, 1, 3417, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 3526, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [4096, 1024, 1, 3485, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3303, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [4096, 1024, 1, 3344, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3479, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [4096, 1024, 1, 3300, 4096, 4096, 4096, 1024] + - [17, 18614.0] + - - [1024, 4096, 1, 3439, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [4096, 1024, 1, 3280, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3245, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3328, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3418, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [1024, 4096, 1, 3493, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3500, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [1024, 4096, 1, 3166, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [4096, 1024, 1, 3126, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3277, 1024, 1024, 1024, 4096] + - [17, 18623.0] + - - [1024, 4096, 1, 3315, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [1024, 4096, 1, 3414, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3531, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [4096, 1024, 1, 3484, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3180, 1024, 1024, 1024, 4096] + - [17, 18618.0] + - - [4096, 1024, 1, 3360, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 33708, 1, 3990, 1024, 1024, 1024, 33708] + - [4, 20876.0] + - - [4096, 1024, 1, 3466, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3428, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [1024, 4096, 1, 3137, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 4059, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [1024, 4096, 1, 3353, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [1024, 4096, 1, 3942, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3506, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3508, 4096, 4096, 4096, 1024] + - [17, 18609.0] + - - [4096, 1024, 1, 3956, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3272, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3443, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3375, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 4096, 1, 3525, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3472, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [1024, 4096, 1, 3520, 1024, 1024, 1024, 4096] + - [33, 18645.0] + - - [4096, 1024, 1, 3322, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 3387, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 33708, 1, 3939, 1024, 1024, 1024, 33708] + - [37, 20880.0] + - - [4096, 1024, 1, 3345, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 2967, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3453, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 4096, 1, 3640, 1024, 1024, 1024, 4096] + - [33, 18645.0] + - - [4096, 1024, 1, 3291, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3350, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3417, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [1024, 4096, 1, 3467, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3491, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3822, 1024, 1024, 1024, 4096] + - [17, 18649.0] + - - [4096, 1024, 1, 3292, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [1024, 4096, 1, 3231, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3364, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3995, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3545, 1024, 1024, 1024, 4096] + - [17, 18655.0] + - - [1024, 4096, 1, 3186, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3432, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3367, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3503, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3095, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [4096, 1024, 1, 3465, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3402, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3140, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3424, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3257, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [4096, 1024, 1, 2917, 4096, 4096, 4096, 1024] + - [42, 18621.0] + - - [1024, 33708, 1, 3640, 1024, 1024, 1024, 33708] + - [37, 20877.0] + - - [1024, 4096, 1, 3456, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [1024, 4096, 1, 3014, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [4096, 1024, 1, 3372, 4096, 4096, 4096, 1024] + - [33, 18640.0] + - - [1024, 4096, 1, 3294, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3446, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3389, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3259, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3544, 4096, 4096, 4096, 1024] + - [17, 18658.0] + - - [4096, 1024, 1, 3479, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3542, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3321, 4096, 4096, 4096, 1024] + - [42, 18634.0] + - - [1024, 4096, 1, 3147, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 4096, 1, 3944, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [4096, 1024, 1, 3870, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3308, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3401, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 4096, 1, 3395, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [1024, 4096, 1, 3563, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 33708, 1, 3870, 1024, 1024, 1024, 33708] + - [4, 20871.0] + - - [4096, 1024, 1, 3494, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3271, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [1024, 33708, 1, 3910, 1024, 1024, 1024, 33708] + - [37, 20861.0] + - - [1024, 4096, 1, 3287, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 33708, 1, 3860, 1024, 1024, 1024, 33708] + - [4, 20855.0] + - - [4096, 1024, 1, 3341, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3136, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3439, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [1024, 4096, 1, 3751, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3301, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [4096, 1024, 1, 3468, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3416, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3163, 4096, 4096, 4096, 1024] + - [33, 18617.0] + - - [1024, 4096, 1, 3230, 1024, 1024, 1024, 4096] + - [0, 18610.0] + - - [1024, 4096, 1, 3581, 1024, 1024, 1024, 4096] + - [24, 18604.0] + - - [4096, 1024, 1, 3463, 4096, 4096, 4096, 1024] + - [24, 18612.0] + - - [1024, 4096, 1, 3478, 1024, 1024, 1024, 4096] + - [0, 18613.0] + - - [4096, 1024, 1, 3262, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3438, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3244, 1024, 1024, 1024, 4096] + - [0, 18641.0] + - - [1024, 4096, 1, 3445, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 4096, 1024] + - [42, 18622.0] + - - [1024, 4096, 1, 3492, 1024, 1024, 1024, 4096] + - [0, 18627.0] + - - [4096, 1024, 1, 3211, 4096, 4096, 4096, 1024] + - [42, 18610.0] + - - [1024, 4096, 1, 3910, 1024, 1024, 1024, 4096] + - [0, 18616.0] + - - [1024, 4096, 1, 3314, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3859, 4096, 4096, 4096, 1024] + - [42, 18616.0] + - - [4096, 1024, 1, 3383, 4096, 4096, 4096, 1024] + - [42, 18616.0] + - - [1024, 4096, 1, 3409, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 4020, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3530, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3411, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [1024, 4096, 1, 3566, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3493, 4096, 4096, 4096, 1024] + - [24, 18628.0] + - - [4096, 1024, 1, 3184, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3431, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 3306, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [1024, 4096, 1, 3352, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 3295, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3517, 1024, 1024, 1024, 4096] + - [0, 18644.0] + - - [4096, 1024, 1, 3426, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3385, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3572, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3459, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [1024, 4096, 1, 3374, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3166, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [4096, 1024, 1, 3093, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3523, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3413, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3996, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3452, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3232, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3400, 4096, 4096, 4096, 1024] + - [17, 18646.0] + - - [4096, 1024, 1, 3334, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3345, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3538, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3466, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3315, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [4096, 1024, 1, 3214, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [1024, 33708, 1, 3900, 1024, 1024, 1024, 33708] + - [37, 20876.0] + - - [1024, 4096, 1, 3367, 1024, 1024, 1024, 4096] + - [33, 18622.0] + - - [1024, 4096, 1, 2917, 1024, 1024, 1024, 4096] + - [33, 18623.0] + - - [1024, 4096, 1, 3544, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3414, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3565, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3512, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3191, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3289, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3290, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3211, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 33708, 1, 3969, 1024, 1024, 1024, 33708] + - [37, 20880.0] + - - [4096, 1024, 1, 3566, 4096, 4096, 4096, 1024] + - [24, 18604.0] + - - [1024, 4096, 1, 3459, 1024, 1024, 1024, 4096] + - [33, 18642.0] + - - [1024, 4096, 1, 3372, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3339, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [4096, 1024, 1, 3425, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [4096, 1024, 1, 3388, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3531, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [4096, 1024, 1, 3286, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [4096, 1024, 1, 3462, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3388, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [4096, 1024, 1, 3165, 4096, 4096, 4096, 1024] + - [0, 18607.0] + - - [4096, 1024, 1, 3304, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 2736, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3397, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [1024, 4096, 1, 3311, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3394, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 2736, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3559, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3180, 4096, 4096, 4096, 1024] + - [33, 18614.0] + - - [1024, 4096, 1, 3480, 1024, 1024, 1024, 4096] + - [0, 18642.0] + - - [4096, 1024, 1, 3318, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3213, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [1024, 4096, 1, 3286, 1024, 1024, 1024, 4096] + - [17, 18639.0] + - - [4096, 1024, 1, 3471, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3381, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3502, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [1024, 4096, 1, 3552, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [4096, 1024, 1, 3519, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3300, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 4096, 1, 3419, 1024, 1024, 1024, 4096] + - [17, 18630.0] + - - [4096, 1024, 1, 4030, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3976, 4096, 4096, 4096, 1024] + - [17, 18648.0] + - - [1024, 4096, 1, 3473, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [4096, 1024, 1, 3428, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [1024, 4096, 1, 3433, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3534, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3461, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3681, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3495, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3351, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [1024, 4096, 1, 4059, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3990, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3325, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 4096, 1, 3408, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [4096, 1024, 1, 3394, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3573, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [4096, 1024, 1, 3386, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3540, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3182, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [1024, 4096, 1, 3430, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3236, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 2977, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3355, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3139, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3516, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [4096, 1024, 1, 3368, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3559, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3506, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3145, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3369, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3522, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 33708, 1, 3894, 1024, 1024, 1024, 33708] + - [4, 20875.0] + - - [4096, 1024, 1, 3336, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3382, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3533, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 4050, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3480, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3344, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [1024, 4096, 1, 3509, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [1024, 4096, 1, 3956, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 3616, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [1024, 4096, 1, 3366, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 2935, 4096, 4096, 4096, 1024] + - [0, 18609.0] + - - [4096, 1024, 1, 3393, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3547, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [1024, 4096, 1, 3499, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3357, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3272, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [4096, 1024, 1, 3207, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [4096, 1024, 1, 3894, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 4096, 1, 3444, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [4096, 1024, 1, 3561, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3376, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3458, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [4096, 1024, 1, 3231, 4096, 4096, 4096, 1024] + - [33, 18619.0] + - - [1024, 4096, 1, 3505, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [4096, 1024, 1, 3277, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 4096, 1, 3391, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [1024, 4096, 1, 3536, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [1024, 4096, 1, 3063, 1024, 1024, 1024, 4096] + - [17, 18619.0] + - - [1024, 4096, 1, 3189, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 2505, 1024, 1024, 1024, 4096] + - [17, 18605.0] + - - [4096, 1024, 1, 3454, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3405, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [1024, 33708, 1, 4050, 1024, 1024, 1024, 33708] + - [21, 20879.0] + - - [4096, 1024, 1, 3520, 4096, 4096, 4096, 1024] + - [17, 18641.0] + - - [1024, 4096, 1, 3487, 1024, 1024, 1024, 4096] + - [33, 18601.0] + - - [1024, 4096, 1, 3558, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3297, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3483, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 33708, 1, 3751, 1024, 1024, 1024, 33708] + - [4, 20871.0] + - - [4096, 1024, 1, 3380, 4096, 4096, 4096, 1024] + - [24, 18603.0] + - - [1024, 4096, 1, 3380, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3396, 1024, 1024, 1024, 4096] + - [0, 18627.0] + - - [1024, 4096, 1, 3497, 1024, 1024, 1024, 4096] + - [0, 18628.0] + - - [1024, 4096, 1, 3502, 1024, 1024, 1024, 4096] + - [0, 18631.0] + - - [1024, 4096, 1, 3138, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3939, 4096, 4096, 4096, 1024] + - [33, 18640.0] + - - [1024, 4096, 1, 3303, 1024, 1024, 1024, 4096] + - [17, 18620.0] + - - [1024, 4096, 1, 3418, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3224, 1024, 1024, 1024, 4096] + - [33, 18649.0] + - - [4096, 1024, 1, 3978, 4096, 4096, 4096, 1024] + - [33, 18642.0] + - - [1024, 4096, 1, 3472, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3353, 4096, 4096, 4096, 1024] + - [0, 18624.0] + - - [4096, 1024, 1, 3362, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [1024, 33708, 1, 3978, 1024, 1024, 1024, 33708] + - [4, 20873.0] + - - [1024, 4096, 1, 3432, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3139, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 4096, 1, 3341, 1024, 1024, 1024, 4096] + - [24, 18607.0] + - - [1024, 4096, 1, 3494, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3969, 1024, 1024, 1024, 4096] + - [17, 18643.0] + - - [1024, 4096, 1, 3163, 1024, 1024, 1024, 4096] + - [9, 18598.0] + - - [4096, 1024, 1, 3405, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [4096, 1024, 1, 3453, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [1024, 4096, 1, 3411, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3527, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3474, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3572, 1024, 1024, 1024, 4096] + - [17, 18623.0] + - - [4096, 1024, 1, 3293, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3247, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3425, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [1024, 4096, 1, 3354, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [4096, 1024, 1, 3382, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3236, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3519, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3354, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3501, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3266, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [1024, 4096, 1, 3368, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 4096, 1, 4030, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3533, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3332, 4096, 4096, 4096, 1024] + - [17, 18619.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3616, 1024, 1024, 1024, 4096] + - [0, 18618.0] + - - [4096, 1024, 1, 3265, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3361, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3467, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3454, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3101, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3508, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3267, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3419, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3822, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3266, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [4096, 1024, 1, 3440, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3361, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3546, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3473, 4096, 4096, 4096, 1024] + - [0, 18618.0] + - - [4096, 1024, 1, 3546, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3088, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3535, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 3447, 1024, 1024, 1024, 4096] + - [33, 18652.0] + - - [1024, 4096, 1, 3560, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [1024, 4096, 1, 3422, 1024, 1024, 1024, 4096] + - [0, 18636.0] + - - [1024, 4096, 1, 3469, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3488, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3110, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 3265, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [1024, 4096, 1, 3291, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3390, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3046, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3539, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3221, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [4096, 1024, 1, 3433, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3364, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [4096, 1024, 1, 3470, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3404, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 33708, 1, 3968, 1024, 1024, 1024, 33708] + - [21, 20878.0] + - - [4096, 1024, 1, 3088, 4096, 4096, 4096, 1024] + - [33, 18620.0] + - - [1024, 4096, 1, 3247, 1024, 1024, 1024, 4096] + - [42, 18603.0] + - - [1024, 33708, 1, 3996, 1024, 1024, 1024, 33708] + - [37, 20870.0] + - - [4096, 1024, 1, 3482, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [4096, 1024, 1, 3995, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3280, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [4096, 1024, 1, 3271, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3545, 4096, 4096, 4096, 1024] + - [17, 18651.0] + - - [4096, 1024, 1, 3476, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3496, 4096, 4096, 4096, 1024] + - [24, 18633.0] + - - [4096, 1024, 1, 3191, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [4096, 1024, 1, 3311, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3302, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3681, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3582, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3421, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3560, 4096, 4096, 4096, 1024] + - [17, 18649.0] + - - [1024, 4096, 1, 3495, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 3186, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3925, 4096, 4096, 4096, 1024] + - [0, 18615.0] + - - [1024, 4096, 1, 3435, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3434, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 33708, 1, 4012, 1024, 1024, 1024, 33708] + - [21, 20868.0] + - - [1024, 4096, 1, 3340, 1024, 1024, 1024, 4096] + - [17, 18612.0] + - - [4096, 1024, 1, 3489, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3162, 1024, 1024, 1024, 4096] + - [17, 18622.0] + - - [4096, 1024, 1, 3436, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3574, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [4096, 1024, 1, 3469, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3410, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3216, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3095, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3448, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3176, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 2918, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3424, 1024, 1024, 1024, 4096] + - [0, 18640.0] + - - [4096, 1024, 1, 3402, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3145, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 33708, 1, 3976, 1024, 1024, 1024, 33708] + - [37, 20877.0] + - - [4096, 1024, 1, 3518, 4096, 4096, 4096, 1024] + - [17, 18615.0] + - - [4096, 1024, 1, 3110, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3325, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 33708, 1, 3999, 1024, 1024, 1024, 33708] + - [4, 20870.0] + - - [4096, 1024, 1, 2985, 4096, 4096, 4096, 1024] + - [33, 18619.0] + - - [1024, 4096, 1, 3371, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3342, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [4096, 1024, 1, 3141, 4096, 4096, 4096, 1024] + - [0, 18601.0] + - - [4096, 1024, 1, 3532, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3169, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [1024, 4096, 1, 3514, 1024, 1024, 1024, 4096] + - [17, 18643.0] + - - [4096, 1024, 1, 3780, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [1024, 4096, 1, 3098, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3449, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [1024, 4096, 1, 3222, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [1024, 4096, 1, 3346, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3064, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3511, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3384, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [4096, 1024, 1, 3356, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3796, 1024, 1024, 1024, 4096] + - [17, 18638.0] + - - [4096, 1024, 1, 3427, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3390, 4096, 4096, 4096, 1024] + - [0, 18618.0] + - - [4096, 1024, 1, 3573, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [4096, 1024, 1, 3456, 4096, 4096, 4096, 1024] + - [17, 18650.0] + - - [1024, 4096, 1, 3360, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 33708, 1, 3977, 1024, 1024, 1024, 33708] + - [37, 20865.0] + - - [1024, 4096, 1, 2918, 1024, 1024, 1024, 4096] + - [17, 18578.0] + - - [4096, 1024, 1, 3975, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3525, 4096, 4096, 4096, 1024] + - [42, 18622.0] + - - [4096, 1024, 1, 3398, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3640, 4096, 4096, 4096, 1024] + - [33, 18648.0] + - - [4096, 1024, 1, 3014, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3446, 1024, 1024, 1024, 4096] + - [33, 18601.0] + - - [1024, 33708, 1, 3796, 1024, 1024, 1024, 33708] + - [4, 20852.0] + - - [4096, 1024, 1, 3101, 4096, 4096, 4096, 1024] + - [33, 18585.0] + - - [4096, 1024, 1, 3563, 4096, 4096, 4096, 1024] + - [33, 18609.0] + - - [4096, 1024, 1, 3539, 4096, 4096, 4096, 1024] + - [42, 18607.0] + - - [4096, 1024, 1, 3182, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3468, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [4096, 1024, 1, 3312, 4096, 4096, 4096, 1024] + - [17, 18598.0] + - - [4096, 1024, 1, 3215, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [4096, 1024, 1, 3910, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 33708, 1, 3780, 1024, 1024, 1024, 33708] + - [4, 20864.0] + - - [1024, 4096, 1, 3290, 1024, 1024, 1024, 4096] + - [33, 18588.0] + - - [1024, 4096, 1, 4012, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 4096, 1, 3385, 1024, 1024, 1024, 4096] + - [42, 18607.0] + - - [1024, 33708, 1, 3975, 1024, 1024, 1024, 33708] + - [4, 20876.0] + - - [4096, 1024, 1, 3996, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [4096, 1024, 1, 2765, 4096, 4096, 4096, 1024] + - [17, 18590.0] + - - [4096, 1024, 1, 3538, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [4096, 1024, 1, 3415, 4096, 4096, 4096, 1024] + - [42, 18633.0] + - - [1024, 4096, 1, 3554, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [4096, 1024, 1, 3513, 4096, 4096, 4096, 1024] + - [33, 18595.0] + - - [1024, 4096, 1, 3304, 1024, 1024, 1024, 4096] + - [9, 18613.0] + - - [4096, 1024, 1, 3294, 4096, 4096, 4096, 1024] + - [0, 18617.0] + - - [4096, 1024, 1, 3396, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3213, 1024, 1024, 1024, 4096] + - [24, 18602.0] + - - [4096, 1024, 1, 3137, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [4096, 1024, 1, 3552, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3461, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [4096, 1024, 1, 3263, 4096, 4096, 4096, 1024] + - [42, 18604.0] + - - [4096, 1024, 1, 3430, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [4096, 1024, 1, 3389, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [4096, 1024, 1, 3528, 4096, 4096, 4096, 1024] + - [33, 18610.0] + - - [1024, 4096, 1, 3463, 1024, 1024, 1024, 4096] + - [17, 18602.0] + - - [4096, 1024, 1, 3526, 4096, 4096, 4096, 1024] + - [17, 18643.0] + - - [4096, 1024, 1, 3154, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [4096, 1024, 1, 3499, 4096, 4096, 4096, 1024] + - [24, 18628.0] + - - [4096, 1024, 1, 3955, 4096, 4096, 4096, 1024] + - [17, 18609.0] + - - [1024, 4096, 1, 3297, 1024, 1024, 1024, 4096] + - [17, 18615.0] + - - [1024, 4096, 1, 3233, 1024, 1024, 1024, 4096] + - [17, 18605.0] + - - [1024, 4096, 1, 3226, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 3404, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3355, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3542, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3181, 4096, 4096, 4096, 1024] + - [0, 18583.0] + - - [1024, 4096, 1, 3474, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3319, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3434, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [1024, 4096, 1, 3860, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3343, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [1024, 4096, 1, 3488, 1024, 1024, 1024, 4096] + - [0, 18631.0] + - - [1024, 4096, 1, 3046, 1024, 1024, 1024, 4096] + - [9, 18585.0] + - - [1024, 4096, 1, 3141, 1024, 1024, 1024, 4096] + - [24, 18602.0] + - - [1024, 4096, 1, 3516, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3147, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3421, 1024, 1024, 1024, 4096] + - [17, 18622.0] + - - [4096, 1024, 1, 3944, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [1024, 4096, 1, 3574, 1024, 1024, 1024, 4096] + - [24, 18601.0] + - - [1024, 4096, 1, 3977, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [1024, 4096, 1, 2985, 1024, 1024, 1024, 4096] + - [42, 18599.0] + - - [1024, 4096, 1, 3427, 1024, 1024, 1024, 4096] + - [0, 18634.0] + - - [1024, 4096, 1, 3482, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3332, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3308, 4096, 4096, 4096, 1024] + - [24, 18605.0] + - - [1024, 4096, 1, 3513, 1024, 1024, 1024, 4096] + - [17, 18640.0] + - - [1024, 4096, 1, 3154, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3955, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [1024, 4096, 1, 2967, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 33708, 1, 3942, 1024, 1024, 1024, 33708] + - [4, 20880.0] + - - [1024, 4096, 1, 3319, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [4096, 1024, 1, 3860, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3548, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3977, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [4096, 1024, 1, 3535, 4096, 4096, 4096, 1024] + - [17, 18600.0] + - - [1024, 4096, 1, 3541, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 33708, 1, 3584, 1024, 1024, 1024, 33708] + - [37, 20844.0] + - - [1024, 4096, 1, 3168, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 4096, 1, 3448, 1024, 1024, 1024, 4096] + - [0, 18604.0] + - - [4096, 1024, 1, 3343, 4096, 4096, 4096, 1024] + - [24, 18606.0] + - - [1024, 4096, 1, 3357, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [4096, 1024, 1, 3510, 4096, 4096, 4096, 1024] + - [33, 18614.0] + - - [4096, 1024, 1, 3369, 4096, 4096, 4096, 1024] + - [33, 18589.0] + - - [4096, 1024, 1, 3379, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3276, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3363, 1024, 1024, 1024, 4096] + - [33, 18600.0] + - - [4096, 1024, 1, 3055, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3524, 1024, 1024, 1024, 4096] + - [33, 18604.0] + - - [4096, 1024, 1, 3057, 4096, 4096, 4096, 1024] + - [0, 18599.0] + - - [1024, 33708, 1, 3720, 1024, 1024, 1024, 33708] + - [21, 20847.0] + - - [1024, 4096, 1, 3383, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3522, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 33708, 1, 3956, 1024, 1024, 1024, 33708] + - [37, 20849.0] + - - [1024, 4096, 1, 3481, 1024, 1024, 1024, 4096] + - [17, 18585.0] + - - [4096, 1024, 1, 3562, 4096, 4096, 4096, 1024] + - [33, 18594.0] + - - [4096, 1024, 1, 3299, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [1024, 4096, 1, 3262, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [1024, 33708, 1, 4026, 1024, 1024, 1024, 33708] + - [37, 20845.0] + - - [4096, 1024, 1, 3168, 4096, 4096, 4096, 1024] + - [33, 18588.0] + - - [1024, 4096, 1, 3999, 1024, 1024, 1024, 4096] + - [17, 18596.0] + - - [1024, 4096, 1, 3549, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3375, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3496, 1024, 1024, 1024, 4096] + - [33, 18608.0] + - - [1024, 4096, 1, 3190, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3273, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [1024, 4096, 1, 3406, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 4005, 4096, 4096, 4096, 1024] + - [17, 18605.0] + - - [4096, 1024, 1, 3555, 4096, 4096, 4096, 1024] + - [33, 18605.0] + - - [4096, 1024, 1, 2505, 4096, 4096, 4096, 1024] + - [33, 18576.0] + - - [1024, 4096, 1, 3460, 1024, 1024, 1024, 4096] + - [33, 18607.0] + - - [1024, 4096, 1, 3579, 1024, 1024, 1024, 4096] + - [24, 18608.0] + - - [1024, 33708, 1, 4030, 1024, 1024, 1024, 33708] + - [4, 20866.0] + - - [1024, 4096, 1, 3510, 1024, 1024, 1024, 4096] + - [17, 18588.0] + - - [1024, 4096, 1, 3282, 1024, 1024, 1024, 4096] + - [17, 18591.0] + - - [1024, 4096, 1, 3377, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 2935, 1024, 1024, 1024, 4096] + - [33, 18595.0] + - - [1024, 4096, 1, 3498, 1024, 1024, 1024, 4096] + - [0, 18639.0] + - - [1024, 4096, 1, 3593, 1024, 1024, 1024, 4096] + - [0, 18604.0] + - - [4096, 1024, 1, 3226, 4096, 4096, 4096, 1024] + - [17, 18593.0] + - - [1024, 4096, 1, 2499, 1024, 1024, 1024, 4096] + - [42, 18585.0] + - - [1024, 4096, 1, 3296, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3455, 1024, 1024, 1024, 4096] + - [17, 18603.0] + - - [1024, 4096, 1, 3399, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [1024, 4096, 1, 3205, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [4096, 1024, 1, 4026, 4096, 4096, 4096, 1024] + - [33, 18606.0] + - - [1024, 4096, 1, 3484, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3302, 4096, 4096, 4096, 1024] + - [17, 18617.0] + - - [1024, 4096, 1, 3485, 1024, 1024, 1024, 4096] + - [17, 18594.0] + - - [1024, 4096, 1, 3126, 1024, 1024, 1024, 4096] + - [42, 18620.0] + - - [1024, 4096, 1, 4050, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3235, 4096, 4096, 4096, 1024] + - [33, 18617.0] + - - [1024, 33708, 1, 3955, 1024, 1024, 1024, 33708] + - [4, 20875.0] + - - [1024, 4096, 1, 3342, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 3397, 1024, 1024, 1024, 4096] + - [0, 18625.0] + - - [4096, 1024, 1, 3491, 4096, 4096, 4096, 1024] + - [24, 18618.0] + - - [1024, 4096, 1, 3503, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3140, 1024, 1024, 1024, 4096] + - [17, 18591.0] + - - [4096, 1024, 1, 3121, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [4096, 1024, 1, 3276, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [1024, 4096, 1, 3321, 1024, 1024, 1024, 4096] + - [17, 18592.0] + - - [1024, 4096, 1, 3870, 1024, 1024, 1024, 4096] + - [17, 18600.0] + - - [4096, 1024, 1, 3475, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 2984, 1024, 1024, 1024, 4096] + - [0, 18602.0] + - - [4096, 1024, 1, 3363, 4096, 4096, 4096, 1024] + - [17, 18594.0] + - - [1024, 4096, 1, 3582, 1024, 1024, 1024, 4096] + - [42, 18606.0] + - - [4096, 1024, 1, 3509, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3426, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [4096, 1024, 1, 3136, 4096, 4096, 4096, 1024] + - [33, 18611.0] + - - [1024, 4096, 1, 3232, 1024, 1024, 1024, 4096] + - [33, 18606.0] + - - [4096, 1024, 1, 3103, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [1024, 4096, 1, 3335, 1024, 1024, 1024, 4096] + - [0, 18620.0] + - - [1024, 4096, 1, 3900, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3512, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3222, 4096, 4096, 4096, 1024] + - [24, 18609.0] + - - [1024, 4096, 1, 3165, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3408, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3751, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3318, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3442, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3413, 1024, 1024, 1024, 4096] + - [0, 18603.0] + - - [4096, 1024, 1, 3524, 4096, 4096, 4096, 1024] + - [0, 18614.0] + - - [1024, 4096, 1, 3976, 1024, 1024, 1024, 4096] + - [33, 18651.0] + - - [1024, 4096, 1, 3475, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [1024, 4096, 1, 3534, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3301, 4096, 4096, 4096, 1024] + - [42, 18623.0] + - - [4096, 1024, 1, 3248, 4096, 4096, 4096, 1024] + - [17, 18642.0] + - - [1024, 4096, 1, 2977, 1024, 1024, 1024, 4096] + - [33, 18587.0] + - - [4096, 1024, 1, 3346, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3451, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3257, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 4096, 1, 3356, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3348, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 3335, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3505, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3490, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3447, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [1024, 4096, 1, 3267, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [4096, 1024, 1, 3230, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3455, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3925, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [1024, 4096, 1, 3362, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3969, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [4096, 1024, 1, 3527, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [1024, 4096, 1, 3585, 1024, 1024, 1024, 4096] + - [9, 18597.0] + - - [4096, 1024, 1, 3063, 4096, 4096, 4096, 1024] + - [0, 18602.0] + - - [4096, 1024, 1, 3435, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [4096, 1024, 1, 3366, 4096, 4096, 4096, 1024] + - [0, 18613.0] + - - [4096, 1024, 1, 3581, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [1024, 33708, 1, 3906, 1024, 1024, 1024, 33708] + - [4, 20873.0] + - - [1024, 4096, 1, 3464, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3440, 1024, 1024, 1024, 4096] + - [3, 18610.0] + - - [4096, 1024, 1, 3143, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3349, 1024, 1024, 1024, 4096] + - [42, 18608.0] + - - [4096, 1024, 1, 3416, 4096, 4096, 4096, 1024] + - [33, 18653.0] + - - [4096, 1024, 1, 3365, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [1024, 4096, 1, 3470, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3287, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3441, 1024, 1024, 1024, 4096] + - [0, 18614.0] + - - [4096, 1024, 1, 3224, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3387, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [1024, 4096, 1, 3547, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3478, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3548, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 33708, 1, 4020, 1024, 1024, 1024, 33708] + - [4, 20880.0] + - - [4096, 1024, 1, 3320, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [1024, 4096, 1, 3906, 1024, 1024, 1024, 4096] + - [9, 18606.0] + - - [4096, 1024, 1, 3796, 4096, 4096, 4096, 1024] + - [33, 18615.0] + - - [1024, 4096, 1, 3306, 1024, 1024, 1024, 4096] + - [17, 18602.0] + - - [1024, 4096, 1, 3401, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [1024, 4096, 1, 3215, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [4096, 1024, 1, 4012, 4096, 4096, 4096, 1024] + - [0, 18610.0] + - - [1024, 4096, 1, 2765, 1024, 1024, 1024, 4096] + - [17, 18609.0] + - - [4096, 1024, 1, 3554, 4096, 4096, 4096, 1024] + - [33, 18602.0] + - - [4096, 1024, 1, 3423, 4096, 4096, 4096, 1024] + - [24, 18606.0] + - - [1024, 4096, 1, 3562, 1024, 1024, 1024, 4096] + - [33, 18597.0] + - - [1024, 4096, 1, 3489, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [4096, 1024, 1, 3358, 4096, 4096, 4096, 1024] + - [42, 18621.0] + - - [4096, 1024, 1, 3270, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3293, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [1024, 4096, 1, 3376, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3245, 4096, 4096, 4096, 1024] + - [42, 18607.0] + - - [4096, 1024, 1, 3541, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3443, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3438, 4096, 4096, 4096, 1024] + - [17, 18603.0] + - - [4096, 1024, 1, 3244, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3365, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3299, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 3471, 1024, 1024, 1024, 4096] + - [33, 18599.0] + - - [1024, 4096, 1, 3398, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3162, 4096, 4096, 4096, 1024] + - [33, 18592.0] + - - [1024, 4096, 1, 4005, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3579, 4096, 4096, 4096, 1024] + - [24, 18620.0] + - - [1024, 4096, 1, 3121, 1024, 1024, 1024, 4096] + - [33, 18589.0] + - - [4096, 1024, 1, 3441, 4096, 4096, 4096, 1024] + - [33, 18600.0] + - - [4096, 1024, 1, 3422, 4096, 4096, 4096, 1024] + - [24, 18615.0] + - - [4096, 1024, 1, 3444, 4096, 4096, 4096, 1024] + - [24, 18614.0] + - - [1024, 4096, 1, 3337, 1024, 1024, 1024, 4096] + - [24, 18598.0] + - - [4096, 1024, 1, 3550, 4096, 4096, 4096, 1024] + - [42, 18609.0] + - - [1024, 4096, 1, 3477, 1024, 1024, 1024, 4096] + - [42, 18612.0] + - - [4096, 1024, 1, 3490, 4096, 4096, 4096, 1024] + - [24, 18617.0] + - - [4096, 1024, 1, 3585, 4096, 4096, 4096, 1024] + - [0, 18616.0] + - - [1024, 4096, 1, 3143, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 33708, 1, 3876, 1024, 1024, 1024, 33708] + - [4, 20879.0] + - - [1024, 4096, 1, 3320, 1024, 1024, 1024, 4096] + - [33, 18595.0] + - - [1024, 4096, 1, 3423, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [1024, 4096, 1, 3894, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3410, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [1024, 4096, 1, 3561, 1024, 1024, 1024, 4096] + - [33, 18597.0] + - - [4096, 1024, 1, 3492, 4096, 4096, 4096, 1024] + - [17, 18592.0] + - - [36548, 1024, 1, 3712, 36548, 36548, 36548, 1024] + - [4, 20844.0] + - - [4096, 2048, 1, 128, 4096, 4096, 4096, 2048] + - [17, 18506.0] + - - [4096, 3072, 1, 128, 4096, 4096, 4096, 3072] + - [33, 19188.0] + - - [768, 3072, 1, 4096, 768, 768, 768, 3072] + - [5, 19946.0] + - - [768, 30522, 1, 1280, 768, 768, 768, 30522] + - [37, 20670.0] + - - [768, 30522, 1, 320, 768, 768, 768, 30522] + - [34, 20228.0] + - - [768, 30522, 1, 640, 768, 768, 768, 30522] + - [37, 20506.0] + - - [256, 512, 36, 98, 256, 256, 256, 512] + - [33, 17358.0] + - - [256, 256, 64, 56, 256, 256, 256, 256] + - [9, 15534.0] + - - [512, 486, 36, 800, 512, 512, 512, 486] + - [1, 19191.0] + - - [512, 512, 36, 1568, 512, 512, 512, 512] + - [21, 20527.0] + - - [256, 384, 36, 4096, 256, 256, 256, 384] + - [22, 20040.0] + - - [128, 256, 64, 32, 128, 128, 128, 256] + - [19, 7661.0] + - - [128, 256, 64, 9, 128, 128, 128, 256] + - [36, 2904.0] + - - [256, 512, 36, 784, 256, 256, 256, 512] + - [4, 19977.0] + - - [256, 324, 36, 32, 256, 256, 256, 324] + - [0, 9691.0] + - - [512, 512, 36, 33, 512, 512, 512, 512] + - [33, 13671.0] + - - [192, 384, 64, 128, 192, 192, 192, 384] + - [0, 13182.0] + - - [512, 512, 64, 72, 512, 512, 512, 512] + - [9, 18993.0] + - - [512, 512, 36, 128, 512, 512, 512, 512] + - [22, 19309.0] + - - [192, 384, 64, 2304, 192, 192, 192, 384] + - [37, 14765.0] + - - [384, 256, 64, 450, 384, 384, 384, 256] + - [1, 19179.0] + - - [384, 256, 64, 2304, 384, 384, 384, 256] + - [37, 19761.0] + - - [512, 512, 64, 144, 512, 512, 512, 512] + - [33, 19399.0] + - - [256, 256, 36, 6272, 256, 256, 256, 256] + - [38, 19884.0] + - - [256, 384, 64, 2304, 256, 256, 256, 384] + - [21, 19761.0] + - - [512, 512, 36, 66, 512, 512, 512, 512] + - [33, 17273.0] + - - [128, 256, 64, 800, 128, 128, 128, 256] + - [22, 17397.0] + - - [192, 256, 36, 512, 192, 192, 192, 256] + - [38, 14160.0] + - - [256, 512, 64, 200, 256, 256, 256, 512] + - [0, 19026.0] + - - [256, 512, 64, 25, 256, 256, 256, 512] + - [35, 10744.0] + - - [128, 256, 36, 1568, 128, 128, 128, 256] + - [35, 18744.0] + - - [128, 256, 64, 288, 128, 128, 128, 256] + - [5, 16740.0] + - - [256, 384, 64, 1152, 256, 256, 256, 384] + - [4, 19583.0] + - - [160, 320, 64, 288, 160, 160, 160, 320] + - [3, 11335.0] + - - [128, 256, 36, 128, 128, 128, 128, 256] + - [0, 14435.0] + - - [512, 512, 36, 16, 512, 512, 512, 512] + - [9, 7634.0] + - - [384, 256, 36, 800, 384, 384, 384, 256] + - [22, 19845.0] + - - [192, 384, 36, 4096, 192, 192, 192, 384] + - [5, 15022.0] + - - [256, 384, 64, 576, 256, 256, 256, 384] + - [38, 19418.0] + - - [512, 512, 64, 14, 512, 512, 512, 512] + - [3, 6848.0] + - - [512, 512, 36, 8, 512, 512, 512, 512] + - [33, 3936.0] + - - [512, 486, 64, 128, 512, 512, 512, 486] + - [0, 18090.0] + - - [256, 256, 36, 128, 256, 256, 256, 256] + - [0, 17100.0] + - - [256, 256, 36, 32, 256, 256, 256, 256] + - [0, 10066.0] + - - [192, 256, 64, 288, 192, 192, 192, 256] + - [0, 13450.0] + - - [256, 256, 36, 16, 256, 256, 256, 256] + - [0, 6011.0] + - - [128, 256, 36, 3200, 128, 128, 128, 256] + - [30, 18809.0] + - - [160, 320, 64, 512, 160, 160, 160, 320] + - [17, 11926.0] + - - [160, 320, 36, 512, 160, 160, 160, 320] + - [36, 11631.0] + - - [256, 512, 36, 4, 256, 256, 256, 512] + - [45, 1958.0] + - - [256, 324, 64, 1568, 256, 256, 256, 324] + - [21, 16521.0] + - - [256, 256, 36, 3200, 256, 256, 256, 256] + - [5, 19747.0] + - - [256, 256, 36, 210, 256, 256, 256, 256] + - [17, 17299.0] + - - [192, 384, 64, 576, 192, 192, 192, 384] + - [1, 14547.0] + - - [512, 512, 64, 800, 512, 512, 512, 512] + - [4, 20281.0] + - - [256, 256, 64, 1152, 256, 256, 256, 256] + - [20, 18325.0] + - - [512, 486, 64, 512, 512, 512, 512, 486] + - [1, 18891.0] + - - [256, 512, 64, 1600, 256, 256, 256, 512] + - [21, 19441.0] + - - [512, 512, 64, 9, 512, 512, 512, 512] + - [3, 4518.0] + - - [256, 512, 36, 1568, 256, 256, 256, 512] + - [5, 20048.0] + - - [128, 256, 64, 3200, 128, 128, 128, 256] + - [12, 17154.0] + - - [256, 512, 64, 4, 256, 256, 256, 512] + - [33, 2110.0] + - - [256, 256, 64, 450, 256, 256, 256, 256] + - [33, 18254.0] + - - [256, 256, 64, 72, 256, 256, 256, 256] + - [0, 16648.0] + - - [128, 256, 36, 3136, 128, 128, 128, 256] + - [38, 18780.0] + - - [160, 320, 64, 242, 160, 160, 160, 320] + - [3, 11103.0] + - - [512, 512, 36, 512, 512, 512, 512, 512] + - [4, 20331.0] + - - [512, 512, 36, 256, 512, 512, 512, 512] + - [1, 19914.0] + - - [512, 512, 36, 1024, 512, 512, 512, 512] + - [4, 20447.0] + - - [256, 256, 36, 4096, 256, 256, 256, 256] + - [38, 19838.0] + - - [256, 256, 64, 896, 256, 256, 256, 256] + - [17, 18282.0] + - - [128, 256, 64, 242, 128, 128, 128, 256] + - [47, 16030.0] + - - [192, 384, 36, 1024, 192, 192, 192, 384] + - [5, 14792.0] + - - [128, 256, 64, 100, 128, 128, 128, 256] + - [17, 14604.0] + - - [384, 256, 64, 1152, 384, 384, 384, 256] + - [21, 19600.0] + - - [192, 384, 36, 128, 192, 192, 192, 384] + - [9, 12030.0] + - - [128, 256, 64, 1568, 128, 128, 128, 256] + - [22, 17525.0] + - - [128, 256, 64, 72, 128, 128, 128, 256] + - [9, 13222.0] + - - [256, 256, 36, 12544, 256, 256, 256, 256] + - [6, 20028.0] + - - [256, 256, 36, 105, 256, 256, 256, 256] + - [9, 15639.0] + - - [128, 256, 36, 392, 128, 128, 128, 256] + - [35, 17255.0] + - - [384, 256, 36, 1024, 384, 384, 384, 256] + - [5, 19900.0] + - - [128, 256, 64, 1152, 128, 128, 128, 256] + - [38, 17456.0] + - - [256, 324, 64, 32, 256, 256, 256, 324] + - [9, 10684.0] + - - [256, 384, 36, 800, 256, 256, 256, 384] + - [38, 19768.0] + - - [512, 512, 64, 4, 512, 512, 512, 512] + - [9, 2075.0] + - - [192, 320, 36, 128, 192, 192, 192, 320] + - [9, 12245.0] + - - [192, 384, 64, 242, 192, 192, 192, 384] + - [38, 13939.0] + - - [256, 486, 64, 32, 256, 256, 256, 486] + - [33, 10283.0] + - - [512, 512, 64, 64, 512, 512, 512, 512] + - [0, 18944.0] + - - [128, 256, 36, 512, 128, 128, 128, 256] + - [5, 17743.0] + - - [512, 512, 64, 576, 512, 512, 512, 512] + - [4, 20185.0] + - - [256, 256, 64, 9, 256, 256, 256, 256] + - [3, 3651.0] + - - [128, 256, 36, 12544, 128, 128, 128, 256] + - [35, 18530.0] + - - [256, 512, 36, 3136, 256, 256, 256, 512] + - [4, 20215.0] + - - [144, 288, 36, 512, 144, 144, 144, 288] + - [3, 9263.0] + - - [384, 384, 36, 800, 384, 384, 384, 384] + - [5, 20079.0] + - - [512, 512, 64, 1600, 512, 512, 512, 512] + - [37, 20436.0] + - - [512, 512, 36, 4, 512, 512, 512, 512] + - [3, 1952.0] + - - [192, 384, 64, 450, 192, 192, 192, 384] + - [38, 14330.0] + - - [256, 256, 36, 1024, 256, 256, 256, 256] + - [38, 19508.0] + - - [256, 512, 64, 400, 256, 256, 256, 512] + - [33, 19240.0] + - - [128, 256, 36, 6272, 128, 128, 128, 256] + - [2, 18244.0] + - - [256, 256, 36, 512, 256, 256, 256, 256] + - [5, 18775.0] + - - [256, 256, 64, 112, 256, 256, 256, 256] + - [33, 17411.0] + - - [512, 512, 64, 18, 512, 512, 512, 512] + - [13, 8473.0] + - - [256, 256, 64, 18, 256, 256, 256, 256] + - [0, 6765.0] + - - [256, 256, 64, 1568, 256, 256, 256, 256] + - [3, 18409.0] + - - [384, 256, 36, 4096, 384, 384, 384, 256] + - [5, 20076.0] + - - [256, 512, 64, 800, 256, 256, 256, 512] + - [20, 19256.0] + - - [256, 384, 36, 2048, 256, 256, 256, 384] + - [47, 19890.0] + - - [384, 384, 64, 2304, 384, 384, 384, 384] + - [37, 20583.0] + - - [160, 320, 64, 128, 160, 160, 160, 320] + - [17, 10613.0] + - - [512, 512, 36, 528, 512, 512, 512, 512] + - [1, 20346.0] + - - [160, 320, 36, 128, 160, 160, 160, 320] + - [0, 10666.0] + - - [256, 512, 36, 49, 256, 256, 256, 512] + - [9, 15072.0] + - - [384, 384, 64, 450, 384, 384, 384, 384] + - [1, 20167.0] + - - [256, 256, 64, 3200, 256, 256, 256, 256] + - [3, 18547.0] + - - [512, 512, 64, 8, 512, 512, 512, 512] + - [9, 4048.0] + - - [512, 512, 64, 288, 512, 512, 512, 512] + - [1, 19850.0] + - - [384, 384, 36, 1024, 384, 384, 384, 384] + - [5, 20078.0] + - - [128, 256, 36, 16, 128, 128, 128, 256] + - [17, 3932.0] + - - [256, 256, 64, 288, 256, 256, 256, 256] + - [17, 18132.0] + - - [256, 384, 36, 1024, 256, 256, 256, 384] + - [22, 19879.0] + - - [256, 324, 36, 3200, 256, 256, 256, 324] + - [5, 16853.0] + - - [192, 384, 64, 512, 192, 192, 192, 384] + - [22, 14392.0] + - - [128, 256, 64, 1600, 128, 128, 128, 256] + - [22, 17520.0] + - - [512, 512, 36, 32, 512, 512, 512, 512] + - [9, 13187.0] + - - [512, 512, 36, 3136, 512, 512, 512, 512] + - [21, 20666.0] + - - [128, 256, 64, 6400, 128, 128, 128, 256] + - [27, 17312.0] + - - [256, 256, 36, 2048, 256, 256, 256, 256] + - [5, 19719.0] + - - [256, 256, 64, 6400, 256, 256, 256, 256] + - [36, 18573.0] + - - [256, 256, 36, 1680, 256, 256, 256, 256] + - [22, 19694.0] + - - [192, 384, 36, 2048, 192, 192, 192, 384] + - [5, 14914.0] + - - [256, 256, 64, 144, 256, 256, 256, 256] + - [33, 17466.0] + - - [384, 384, 36, 4096, 384, 384, 384, 384] + - [38, 20278.0] + - - [160, 320, 64, 1152, 160, 160, 160, 320] + - [3, 11993.0] + - - [384, 256, 36, 2048, 384, 384, 384, 256] + - [38, 19945.0] + - - [256, 512, 36, 392, 256, 256, 256, 512] + - [22, 19381.0] + - - [256, 512, 64, 50, 256, 256, 256, 512] + - [0, 16346.0] + - - [384, 384, 36, 2048, 384, 384, 384, 384] + - [5, 20191.0] + - - [256, 384, 64, 450, 256, 256, 256, 384] + - [1, 19184.0] + - - [192, 320, 64, 128, 192, 192, 192, 320] + - [17, 12866.0] + - - [128, 256, 36, 32, 128, 128, 128, 256] + - [19, 7316.0] + - - [512, 512, 64, 256, 512, 512, 512, 512] + - [1, 19945.0] + - - [256, 512, 64, 32, 256, 256, 256, 512] + - [33, 13184.0] + - - [384, 384, 64, 576, 384, 384, 384, 384] + - [4, 20206.0] + - - [512, 486, 36, 288, 512, 512, 512, 486] + - [22, 18271.0] + - - [144, 288, 64, 242, 144, 144, 144, 288] + - [13, 9132.0] + - - [384, 256, 64, 576, 384, 384, 384, 256] + - [18, 19563.0] + - - [512, 512, 36, 64, 512, 512, 512, 512] + - [33, 18269.0] + - - [448, 384, 64, 128, 448, 448, 448, 384] + - [17, 16429.0] + - - [144, 288, 64, 288, 144, 144, 144, 288] + - [13, 9320.0] + - - [512, 512, 64, 224, 512, 512, 512, 512] + - [10, 19897.0] + - - [384, 384, 64, 1152, 384, 384, 384, 384] + - [21, 20465.0] + - - [448, 384, 36, 128, 448, 448, 448, 384] + - [33, 15611.0] + - - [256, 486, 36, 128, 256, 256, 256, 486] + - [0, 16503.0] + - - [256, 256, 36, 800, 256, 256, 256, 256] + - [5, 19434.0] + - - [192, 384, 36, 800, 192, 192, 192, 384] + - [5, 14727.0] + - - [256, 256, 36, 256, 256, 256, 256, 256] + - [0, 18127.0] + - - [192, 384, 64, 1152, 192, 192, 192, 384] + - [5, 14617.0] + - - [128, 256, 64, 200, 128, 128, 128, 256] + - [33, 15780.0] + - - [512, 512, 64, 28, 512, 512, 512, 512] + - [26, 12447.0] + - - [144, 288, 64, 1152, 144, 144, 144, 288] + - [3, 9720.0] + - - [256, 256, 64, 576, 256, 256, 256, 256] + - [33, 18327.0] + - - [256, 256, 64, 2304, 256, 256, 256, 256] + - [36, 18495.0] + - - [192, 384, 36, 512, 192, 192, 192, 384] + - [5, 14350.0] + - - [256, 512, 36, 32, 256, 256, 256, 512] + - [33, 12731.0] + - - [512, 512, 64, 128, 512, 512, 512, 512] + - [1, 19469.0] + - - [512, 512, 64, 32, 512, 512, 512, 512] + - [9, 15252.0] + - - [128, 256, 36, 196, 128, 128, 128, 256] + - [0, 15435.0] + - - [196, 528, 32, 32, 196, 196, 196, 528] + - [33, 6353.0] + - - [196, 512, 32, 24, 196, 196, 196, 512] + - [0, 5070.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [42, 12402.0] + - - [1001, 1536, 1, 32, 1001, 1001, 1001, 1536] + - [19, 6853.0] + - - [196, 480, 32, 64, 196, 196, 196, 480] + - [33, 9263.0] + - - [289, 1024, 32, 384, 289, 289, 289, 1024] + - [1, 14734.0] + - - [784, 192, 32, 96, 784, 784, 784, 192] + - [9, 15033.0] + - - [50176, 256, 1, 128, 50176, 50176, 50176, 256] + - [38, 19262.0] + - - [289, 1024, 32, 256, 289, 289, 289, 1024] + - [33, 14627.0] + - - [289, 1024, 32, 192, 289, 289, 289, 1024] + - [33, 14541.0] + - - [12544, 512, 1, 256, 12544, 12544, 12544, 512] + - [38, 19527.0] + - - [1225, 1728, 1, 192, 1225, 1225, 1225, 1728] + - [33, 15802.0] + - - [196, 480, 32, 96, 196, 196, 196, 480] + - [0, 10816.0] + - - [196, 512, 32, 144, 196, 196, 196, 512] + - [0, 13041.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [17, 13935.0] + - - [5329, 576, 1, 96, 5329, 5329, 5329, 576] + - [9, 16191.0] + - - [196, 528, 32, 128, 196, 196, 196, 528] + - [9, 11914.0] + - - [5329, 448, 1, 64, 5329, 5329, 5329, 448] + - [9, 12754.0] + - - [784, 256, 32, 64, 784, 784, 784, 256] + - [9, 15224.0] + - - [784, 192, 32, 32, 784, 784, 784, 192] + - [0, 11731.0] + - - [21609, 288, 1, 32, 21609, 21609, 21609, 288] + - [33, 11315.0] + - - [784, 256, 32, 32, 784, 784, 784, 256] + - [24, 12718.0] + - - [5041, 720, 1, 192, 5041, 5041, 5041, 720] + - [0, 16343.0] + - - [196, 512, 32, 128, 196, 196, 196, 512] + - [0, 12687.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [33, 14054.0] + - - [1001, 4096, 1, 512, 1001, 1001, 1001, 4096] + - [0, 17948.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [0, 16429.0] + - - [784, 192, 32, 16, 784, 784, 784, 192] + - [17, 6644.0] + - - [3136, 1024, 1, 2048, 3136, 3136, 3136, 1024] + - [22, 18201.0] + - - [784, 256, 32, 128, 784, 784, 784, 256] + - [5, 16013.0] + - - [196, 512, 32, 32, 196, 196, 196, 512] + - [33, 6439.0] + - - [1225, 384, 32, 96, 1225, 1225, 1225, 384] + - [0, 18163.0] + - - [5041, 576, 1, 96, 5041, 5041, 5041, 576] + - [17, 15838.0] + - - [5329, 160, 32, 64, 5329, 5329, 5329, 160] + - [0, 13447.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [9, 13367.0] + - - [4096, 9216, 1, 512, 4096, 4096, 4096, 9216] + - [21, 20721.0] + - - [196, 480, 32, 192, 196, 196, 196, 480] + - [33, 12436.0] + - - [3136, 1024, 1, 512, 3136, 3136, 3136, 1024] + - [5, 17604.0] + - - [784, 192, 32, 64, 784, 784, 784, 192] + - [17, 14638.0] + - - [289, 1024, 32, 128, 289, 289, 289, 1024] + - [0, 14167.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [33, 14120.0] + - - [196, 512, 32, 112, 196, 196, 196, 512] + - [0, 12567.0] + - - [1001, 2048, 1, 32, 1001, 1001, 1001, 2048] + - [33, 7885.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [9, 14941.0] + - - [1225, 384, 32, 192, 1225, 1225, 1225, 384] + - [38, 18777.0] + - - [50176, 256, 1, 512, 50176, 50176, 50176, 256] + - [4, 20134.0] + - - [196, 512, 32, 160, 196, 196, 196, 512] + - [9, 12923.0] + - - [4096, 4096, 1, 512, 4096, 4096, 4096, 4096] + - [37, 20256.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [9, 16937.0] + - - [196, 480, 32, 16, 196, 196, 196, 480] + - [33, 3431.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [0, 15845.0] + - - [1225, 1200, 1, 64, 1225, 1225, 1225, 1200] + - [33, 10453.0] + - - [1225, 384, 32, 64, 1225, 1225, 1225, 384] + - [0, 17503.0] + - - [12544, 512, 1, 1024, 12544, 12544, 12544, 512] + - [1, 20088.0] + - - [196, 512, 32, 64, 196, 196, 196, 512] + - [33, 9843.0] + - - [196, 528, 32, 256, 196, 196, 196, 528] + - [0, 12706.0] + - - [196, 528, 32, 160, 196, 196, 196, 528] + - [0, 12260.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [0, 15465.0] + - - [1001, 2048, 1, 64, 1001, 1001, 1001, 2048] + - [33, 11993.0] + - - [289, 768, 128, 128, 289, 289, 289, 768] + - [0, 14649.0] + - - [1225, 192, 128, 64, 1225, 1225, 1225, 192] + - [40, 8201.0] + - - [1225, 288, 128, 48, 1225, 1225, 1225, 288] + - [16, 6826.0] + - - [289, 768, 128, 192, 289, 289, 289, 768] + - [17, 14734.0] + - - [289, 768, 128, 160, 289, 289, 289, 768] + - [0, 14467.0] + - - [1225, 256, 128, 48, 1225, 1225, 1225, 256] + - [0, 6901.0] + - - [1225, 192, 128, 48, 1225, 1225, 1225, 192] + - [40, 7408.0] + - - [1225, 288, 128, 64, 1225, 1225, 1225, 288] + - [8, 8433.0] + - - [1225, 256, 128, 64, 1225, 1225, 1225, 256] + - [7, 7484.0] + - - [1001, 2048, 1, 128, 1001, 1001, 1001, 2048] + - [17, 14230.0] + - - [1225, 192, 128, 32, 1225, 1225, 1225, 192] + - [45, 7979.0] + - - [1001, 1536, 1, 64, 1001, 1001, 1001, 1536] + - [26, 9498.0] + - - [1024, 4096, 1, 64, 1024, 1024, 1024, 4096] + - [9, 16151.0] + - - [1024, 4096, 1, 6336, 1024, 1024, 1024, 4096] + - [33, 18654.0] + - - [512, 33708, 1, 3780, 512, 512, 512, 33708] + - [4, 20383.0] + - - [512, 33708, 1, 3968, 512, 512, 512, 33708] + - [4, 20394.0] + - - [512, 33708, 1, 4030, 512, 512, 512, 33708] + - [37, 20379.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [17, 14086.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [17, 14886.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [17, 14378.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [17, 14508.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [17, 14541.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [9, 16841.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [0, 17091.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [9, 12811.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [9, 16679.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [17, 17497.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [9, 16949.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [9, 17801.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [33, 14548.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [0, 15595.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 13888.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [21, 20128.0] + - - [512, 33708, 1, 3796, 512, 512, 512, 33708] + - [21, 20344.0] + - - [512, 33708, 1, 3822, 512, 512, 512, 33708] + - [21, 20360.0] + - - [512, 33708, 1, 3840, 512, 512, 512, 33708] + - [37, 20351.0] + - - [512, 33708, 1, 3859, 512, 512, 512, 33708] + - [21, 20363.0] + - - [512, 33708, 1, 3870, 512, 512, 512, 33708] + - [4, 20338.0] + - - [512, 33708, 1, 3876, 512, 512, 512, 33708] + - [4, 20343.0] + - - [512, 33708, 1, 3906, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 3910, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3925, 512, 512, 512, 33708] + - [21, 20345.0] + - - [512, 33708, 1, 3942, 512, 512, 512, 33708] + - [4, 20341.0] + - - [512, 33708, 1, 3944, 512, 512, 512, 33708] + - [4, 20347.0] + - - [512, 33708, 1, 3955, 512, 512, 512, 33708] + - [4, 20344.0] + - - [512, 33708, 1, 3969, 512, 512, 512, 33708] + - [4, 20347.0] + - - [512, 33708, 1, 3976, 512, 512, 512, 33708] + - [4, 20340.0] + - - [512, 33708, 1, 3977, 512, 512, 512, 33708] + - [4, 20344.0] + - - [512, 33708, 1, 3978, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3990, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3995, 512, 512, 512, 33708] + - [21, 20346.0] + - - [512, 33708, 1, 3996, 512, 512, 512, 33708] + - [4, 20343.0] + - - [512, 33708, 1, 3999, 512, 512, 512, 33708] + - [4, 20342.0] + - - [512, 33708, 1, 4005, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 4012, 512, 512, 512, 33708] + - [21, 20342.0] + - - [512, 33708, 1, 4020, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 4026, 512, 512, 512, 33708] + - [37, 20339.0] + - - [512, 33708, 1, 4032, 512, 512, 512, 33708] + - [4, 20351.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [17, 18739.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [33, 18832.0] + - - [1024, 30522, 1, 20, 1024, 1024, 1024, 30522] + - [3, 10151.0] + - - [1024, 30522, 1, 80, 1024, 1024, 1024, 30522] + - [9, 18774.0] + - - [1024, 30522, 1, 120, 1024, 1024, 1024, 30522] + - [9, 19163.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [17, 18606.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [33, 18608.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [17, 18613.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [37, 20839.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [21, 20841.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [37, 20847.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [0, 18599.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [36, 18591.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [3, 18606.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [20, 18605.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [36, 18600.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [4, 20775.0] + - - [7744, 7744, 1, 7744, 7744, 7744, 7744, 7744] + - [4, 20634.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 1152] + - [17, 15546.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 1536] + - [38, 19113.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 1920] + - [33, 18365.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 2304] + - [22, 19741.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 2688] + - [17, 19106.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 3072] + - [10, 20137.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 3456] + - [37, 19717.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 3840] + - [38, 20201.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 4224] + - [4, 20092.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 4608] + - [4, 20574.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 4992] + - [37, 20335.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 5376] + - [38, 20308.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 5760] + - [34, 20427.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 6144] + - [4, 20562.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 6528] + - [21, 20539.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 6912] + - [37, 20530.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 7296] + - [21, 20627.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 7680] + - [37, 20716.0] + - - [1536, 768, 1, 384, 1536, 1536, 1536, 768] + - [19, 16581.0] + - - [1920, 960, 1, 384, 1920, 1920, 1920, 960] + - [17, 16560.0] + - - [2304, 1152, 1, 384, 2304, 2304, 2304, 1152] + - [5, 17393.0] + - - [2688, 1344, 1, 384, 2688, 2688, 2688, 1344] + - [17, 18049.0] + - - [3072, 1536, 1, 384, 3072, 3072, 3072, 1536] + - [47, 19648.0] + - - [3456, 1728, 1, 384, 3456, 3456, 3456, 1728] + - [33, 18766.0] + - - [3840, 1920, 1, 384, 3840, 3840, 3840, 1920] + - [34, 19368.0] + - - [4224, 2112, 1, 384, 4224, 4224, 4224, 2112] + - [17, 19134.0] + - - [4608, 2304, 1, 384, 4608, 4608, 4608, 2304] + - [4, 20217.0] + - - [4992, 2496, 1, 384, 4992, 4992, 4992, 2496] + - [1, 19501.0] + - - [5376, 2688, 1, 384, 5376, 5376, 5376, 2688] + - [37, 20065.0] + - - [5760, 2880, 1, 384, 5760, 5760, 5760, 2880] + - [18, 19850.0] + - - [6144, 3072, 1, 384, 6144, 6144, 6144, 3072] + - [37, 20503.0] + - - [6528, 3264, 1, 384, 6528, 6528, 6528, 3264] + - [37, 20046.0] + - - [6912, 3456, 1, 384, 6912, 6912, 6912, 3456] + - [4, 20397.0] + - - [7296, 3648, 1, 384, 7296, 7296, 7296, 3648] + - [18, 20175.0] + - - [7680, 3840, 1, 384, 7680, 7680, 7680, 3840] + - [37, 20711.0] + - - [768, 1536, 1, 384, 768, 768, 768, 1536] + - [35, 17422.0] + - - [1152, 2304, 1, 384, 1152, 1152, 1152, 2304] + - [5, 17488.0] + - - [1536, 3072, 1, 384, 1536, 1536, 1536, 3072] + - [38, 19742.0] + - - [1920, 3840, 1, 384, 1920, 1920, 1920, 3840] + - [18, 19391.0] + - - [2304, 4608, 1, 384, 2304, 2304, 2304, 4608] + - [4, 20265.0] + - - [2688, 5376, 1, 384, 2688, 2688, 2688, 5376] + - [18, 20111.0] + - - [3072, 6144, 1, 384, 3072, 3072, 3072, 6144] + - [14, 20531.0] + - - [3456, 6912, 1, 384, 3456, 3456, 3456, 6912] + - [4, 20442.0] + - - [3840, 7680, 1, 384, 3840, 3840, 3840, 7680] + - [37, 20583.0] + - - [4224, 8448, 1, 384, 4224, 4224, 4224, 8448] + - [4, 20521.0] + - - [4608, 9216, 1, 384, 4608, 4608, 4608, 9216] + - [37, 20628.0] + - - [4992, 9984, 1, 384, 4992, 4992, 4992, 9984] + - [37, 20664.0] + - - [5376, 10752, 1, 384, 5376, 5376, 5376, 10752] + - [37, 20729.0] + - - [5760, 11520, 1, 384, 5760, 5760, 5760, 11520] + - [21, 20738.0] + - - [6144, 12288, 1, 384, 6144, 6144, 6144, 12288] + - [21, 20737.0] + - - [6528, 13056, 1, 384, 6528, 6528, 6528, 13056] + - [21, 20785.0] + - - [6912, 13824, 1, 384, 6912, 6912, 6912, 13824] + - [37, 20819.0] + - - [7296, 14592, 1, 384, 7296, 7296, 7296, 14592] + - [4, 20831.0] + - - [7680, 15360, 1, 384, 7680, 7680, 7680, 15360] + - [37, 20840.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [33, 18438.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [3, 19032.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [22, 18342.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [38, 19203.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [22, 19592.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [22, 20060.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [33, 18127.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [20, 19261.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [23, 18793.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [23, 19521.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [39, 19908.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [36, 18736.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [36, 19395.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [5, 18970.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [22, 19946.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [5, 20247.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [20, 19485.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [23, 19324.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [13, 18781.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [21, 19673.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [37, 20201.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [33, 18927.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [3, 19138.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [23, 19642.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [31, 19491.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [37, 19490.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [4, 20099.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [3, 15776.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [3, 16570.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [3, 18169.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [22, 16891.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [38, 17606.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [33, 17103.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [23, 17851.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [23, 19448.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [28, 18124.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [3, 18616.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [33, 18985.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [37, 20367.0] + - - [512, 3280, 1, 1600, 512, 512, 512, 3280] + - [2, 18126.0] + - - [512, 3280, 1, 200, 512, 512, 512, 3280] + - [5, 15813.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [22, 16223.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [22, 17609.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [17, 18504.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] + - [33, 18700.0] + - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] + - [0, 18517.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [17, 18593.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [37, 20562.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [33, 18648.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [37, 20628.0] + - - [9216, 128, 1, 128, 9216, 9216, 9216, 128] + - [0, 12884.0] + - - [9600, 128, 1, 128, 9600, 9600, 9600, 128] + - [0, 13240.0] + - - [9984, 128, 1, 128, 9984, 9984, 9984, 128] + - [0, 13746.0] + - - [10368, 128, 1, 128, 10368, 10368, 10368, 128] + - [0, 14203.0] + - - [10752, 128, 1, 128, 10752, 10752, 10752, 128] + - [0, 14416.0] + - - [11136, 128, 1, 128, 11136, 11136, 11136, 128] + - [17, 14980.0] + - - [11520, 128, 1, 128, 11520, 11520, 11520, 128] + - [33, 13919.0] + - - [11904, 128, 1, 128, 11904, 11904, 11904, 128] + - [33, 14362.0] + - - [12288, 128, 1, 128, 12288, 12288, 12288, 128] + - [0, 14695.0] + - - [12672, 128, 1, 128, 12672, 12672, 12672, 128] + - [33, 15067.0] + - - [13056, 128, 1, 128, 13056, 13056, 13056, 128] + - [0, 15367.0] + - - [13440, 128, 1, 128, 13440, 13440, 13440, 128] + - [5, 15888.0] + - - [13824, 128, 1, 128, 13824, 13824, 13824, 128] + - [0, 15950.0] + - - [14208, 128, 1, 128, 14208, 14208, 14208, 128] + - [17, 15057.0] + - - [14592, 128, 1, 128, 14592, 14592, 14592, 128] + - [0, 15208.0] + - - [14976, 128, 1, 128, 14976, 14976, 14976, 128] + - [33, 15589.0] + - - [15360, 128, 1, 128, 15360, 15360, 15360, 128] + - [0, 15908.0] + - - [15744, 128, 1, 128, 15744, 15744, 15744, 128] + - [0, 16326.0] + - - [16128, 128, 1, 128, 16128, 16128, 16128, 128] + - [33, 15292.0] + - - [16512, 128, 1, 128, 16512, 16512, 16512, 128] + - [15, 15638.0] + - - [16896, 128, 1, 128, 16896, 16896, 16896, 128] + - [33, 15711.0] + - - [17280, 128, 1, 128, 17280, 17280, 17280, 128] + - [38, 16252.0] + - - [17664, 128, 1, 128, 17664, 17664, 17664, 128] + - [0, 16314.0] + - - [18048, 128, 1, 128, 18048, 18048, 18048, 128] + - [5, 17033.0] + - - [18432, 128, 1, 128, 18432, 18432, 18432, 128] + - [0, 16890.0] + - - [18816, 128, 1, 128, 18816, 18816, 18816, 128] + - [0, 15842.0] + - - [19200, 128, 1, 128, 19200, 19200, 19200, 128] + - [0, 15968.0] + - - [19584, 128, 1, 128, 19584, 19584, 19584, 128] + - [17, 16337.0] + - - [19968, 128, 1, 128, 19968, 19968, 19968, 128] + - [0, 16490.0] + - - [20352, 128, 1, 128, 20352, 20352, 20352, 128] + - [33, 16926.0] + - - [20736, 128, 1, 128, 20736, 20736, 20736, 128] + - [17, 16086.0] + - - [21120, 128, 1, 128, 21120, 21120, 21120, 128] + - [38, 16400.0] + - - [21504, 128, 1, 128, 21504, 21504, 21504, 128] + - [0, 16372.0] + - - [21888, 128, 1, 128, 21888, 21888, 21888, 128] + - [5, 16820.0] + - - [22272, 128, 1, 128, 22272, 22272, 22272, 128] + - [5, 16847.0] + - - [22656, 128, 1, 128, 22656, 22656, 22656, 128] + - [5, 17329.0] + - - [23040, 128, 1, 128, 23040, 23040, 23040, 128] + - [0, 17268.0] + - - [9216, 128, 1, 256, 9216, 9216, 9216, 128] + - [3, 16377.0] + - - [9600, 128, 1, 256, 9600, 9600, 9600, 128] + - [0, 14390.0] + - - [9984, 128, 1, 256, 9984, 9984, 9984, 128] + - [33, 14911.0] + - - [10368, 128, 1, 256, 10368, 10368, 10368, 128] + - [0, 15485.0] + - - [10752, 128, 1, 256, 10752, 10752, 10752, 128] + - [0, 15885.0] + - - [11136, 128, 1, 256, 11136, 11136, 11136, 128] + - [17, 16497.0] + - - [11520, 128, 1, 256, 11520, 11520, 11520, 128] + - [15, 14992.0] + - - [11904, 128, 1, 256, 11904, 11904, 11904, 128] + - [5, 15628.0] + - - [12288, 128, 1, 256, 12288, 12288, 12288, 128] + - [22, 15790.0] + - - [12672, 128, 1, 256, 12672, 12672, 12672, 128] + - [38, 16556.0] + - - [13056, 128, 1, 256, 13056, 13056, 13056, 128] + - [30, 16660.0] + - - [13440, 128, 1, 256, 13440, 13440, 13440, 128] + - [38, 17462.0] + - - [13824, 128, 1, 256, 13824, 13824, 13824, 128] + - [5, 17558.0] + - - [14208, 128, 1, 256, 14208, 14208, 14208, 128] + - [33, 15988.0] + - - [14592, 128, 1, 256, 14592, 14592, 14592, 128] + - [0, 16231.0] + - - [14976, 128, 1, 256, 14976, 14976, 14976, 128] + - [33, 16749.0] + - - [15360, 128, 1, 256, 15360, 15360, 15360, 128] + - [0, 17050.0] + - - [15744, 128, 1, 256, 15744, 15744, 15744, 128] + - [17, 17548.0] + - - [16128, 128, 1, 256, 16128, 16128, 16128, 128] + - [22, 16251.0] + - - [16512, 128, 1, 256, 16512, 16512, 16512, 128] + - [5, 16762.0] + - - [16896, 128, 1, 256, 16896, 16896, 16896, 128] + - [15, 16880.0] + - - [17280, 128, 1, 256, 17280, 17280, 17280, 128] + - [47, 17476.0] + - - [17664, 128, 1, 256, 17664, 17664, 17664, 128] + - [15, 17519.0] + - - [18048, 128, 1, 256, 18048, 18048, 18048, 128] + - [38, 18197.0] + - - [18432, 128, 1, 256, 18432, 18432, 18432, 128] + - [22, 18225.0] + - - [18816, 128, 1, 256, 18816, 18816, 18816, 128] + - [33, 16682.0] + - - [19200, 128, 1, 256, 19200, 19200, 19200, 128] + - [33, 16958.0] + - - [19584, 128, 1, 256, 19584, 19584, 19584, 128] + - [33, 17279.0] + - - [19968, 128, 1, 256, 19968, 19968, 19968, 128] + - [33, 17532.0] + - - [20352, 128, 1, 256, 20352, 20352, 20352, 128] + - [33, 17908.0] + - - [20736, 128, 1, 256, 20736, 20736, 20736, 128] + - [5, 17038.0] + - - [21120, 128, 1, 256, 21120, 21120, 21120, 128] + - [5, 17362.0] + - - [21504, 128, 1, 256, 21504, 21504, 21504, 128] + - [38, 17537.0] + - - [21888, 128, 1, 256, 21888, 21888, 21888, 128] + - [22, 17922.0] + - - [22272, 128, 1, 256, 22272, 22272, 22272, 128] + - [5, 18047.0] + - - [22656, 128, 1, 256, 22656, 22656, 22656, 128] + - [5, 18551.0] + - - [23040, 128, 1, 256, 23040, 23040, 23040, 128] + - [15, 18559.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 8064] + - [37, 20737.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 8448] + - [21, 20706.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 8832] + - [37, 20778.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 9216] + - [21, 20759.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 9600] + - [14, 20794.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 9984] + - [21, 20802.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 10368] + - [18, 20806.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 10752] + - [14, 20830.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 11136] + - [37, 20846.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 11520] + - [21, 20809.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 11904] + - [4, 20879.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 12288] + - [21, 20846.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 12672] + - [21, 20870.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 13056] + - [4, 20856.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 13440] + - [21, 20886.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 13824] + - [37, 20884.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 14208] + - [37, 20899.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 14592] + - [37, 20882.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 14976] + - [21, 20903.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 15360] + - [21, 20889.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 15744] + - [21, 20911.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 16128] + - [4, 20900.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 16512] + - [21, 20919.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 16896] + - [37, 20908.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 17280] + - [21, 20922.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 17664] + - [37, 20914.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 18048] + - [21, 20924.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 18432] + - [21, 20901.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 18816] + - [37, 20933.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 19200] + - [37, 20925.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 19584] + - [21, 20938.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 19968] + - [37, 20925.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 20352] + - [21, 20940.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 20736] + - [4, 20929.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 21120] + - [37, 20950.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 21504] + - [21, 20952.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 21888] + - [37, 20946.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 22272] + - [4, 20937.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 22656] + - [37, 20960.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 23040] + - [4, 20935.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [33, 16439.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [38, 19755.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [17, 18693.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [38, 20224.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [37, 19394.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [4, 20659.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [37, 20030.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [22, 20473.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [37, 20426.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [21, 20906.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [37, 20632.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [37, 20562.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [4, 21022.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [37, 20865.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [4, 20830.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [21, 20912.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [4, 21050.0] + - - [8064, 4032, 1, 384, 8064, 8064, 8064, 4032] + - [4, 20238.0] + - - [8448, 4224, 1, 384, 8448, 8448, 8448, 4224] + - [37, 20519.0] + - - [8832, 4416, 1, 384, 8832, 8832, 8832, 4416] + - [4, 20127.0] + - - [9216, 4608, 1, 384, 9216, 9216, 9216, 4608] + - [4, 20607.0] + - - [9600, 4800, 1, 384, 9600, 9600, 9600, 4800] + - [4, 20249.0] + - - [9984, 4992, 1, 384, 9984, 9984, 9984, 4992] + - [37, 20636.0] + - - [10368, 5184, 1, 384, 10368, 10368, 10368, 5184] + - [37, 20352.0] + - - [10752, 5376, 1, 384, 10752, 10752, 10752, 5376] + - [4, 20717.0] + - - [11136, 5568, 1, 384, 11136, 11136, 11136, 5568] + - [21, 20429.0] + - - [11520, 5760, 1, 384, 11520, 11520, 11520, 5760] + - [37, 20714.0] + - - [11904, 5952, 1, 384, 11904, 11904, 11904, 5952] + - [4, 20491.0] + - - [12288, 6144, 1, 384, 12288, 12288, 12288, 6144] + - [21, 20745.0] + - - [12672, 6336, 1, 384, 12672, 12672, 12672, 6336] + - [4, 20545.0] + - - [13056, 6528, 1, 384, 13056, 13056, 13056, 6528] + - [4, 20770.0] + - - [13440, 6720, 1, 384, 13440, 13440, 13440, 6720] + - [21, 20599.0] + - - [13824, 6912, 1, 384, 13824, 13824, 13824, 6912] + - [37, 20809.0] + - - [14208, 7104, 1, 384, 14208, 14208, 14208, 7104] + - [4, 20627.0] + - - [14592, 7296, 1, 384, 14592, 14592, 14592, 7296] + - [37, 20808.0] + - - [14976, 7488, 1, 384, 14976, 14976, 14976, 7488] + - [4, 20659.0] + - - [15360, 7680, 1, 384, 15360, 15360, 15360, 7680] + - [21, 20816.0] + - - [15744, 7872, 1, 384, 15744, 15744, 15744, 7872] + - [4, 20673.0] + - - [16128, 8064, 1, 384, 16128, 16128, 16128, 8064] + - [4, 20837.0] + - - [16512, 8256, 1, 384, 16512, 16512, 16512, 8256] + - [37, 20722.0] + - - [16896, 8448, 1, 384, 16896, 16896, 16896, 8448] + - [37, 20860.0] + - - [17280, 8640, 1, 384, 17280, 17280, 17280, 8640] + - [4, 20677.0] + - - [17664, 8832, 1, 384, 17664, 17664, 17664, 8832] + - [4, 20859.0] + - - [18048, 9024, 1, 384, 18048, 18048, 18048, 9024] + - [4, 20698.0] + - - [18432, 9216, 1, 384, 18432, 18432, 18432, 9216] + - [21, 20851.0] + - - [18816, 9408, 1, 384, 18816, 18816, 18816, 9408] + - [37, 20711.0] + - - [19200, 9600, 1, 384, 19200, 19200, 19200, 9600] + - [37, 20886.0] + - - [19584, 9792, 1, 384, 19584, 19584, 19584, 9792] + - [21, 20733.0] + - - [19968, 9984, 1, 384, 19968, 19968, 19968, 9984] + - [21, 20874.0] + - - [20352, 10176, 1, 384, 20352, 20352, 20352, 10176] + - [4, 20749.0] + - - [20736, 10368, 1, 384, 20736, 20736, 20736, 10368] + - [37, 20903.0] + - - [21120, 10560, 1, 384, 21120, 21120, 21120, 10560] + - [21, 20764.0] + - - [21504, 10752, 1, 384, 21504, 21504, 21504, 10752] + - [21, 20883.0] + - - [21888, 10944, 1, 384, 21888, 21888, 21888, 10944] + - [37, 20773.0] + - - [22272, 11136, 1, 384, 22272, 22272, 22272, 11136] + - [37, 20911.0] + - - [22656, 11328, 1, 384, 22656, 22656, 22656, 11328] + - [21, 20796.0] + - - [23040, 11520, 1, 384, 23040, 23040, 23040, 11520] + - [37, 20909.0] + - - [8064, 16128, 1, 384, 8064, 8064, 8064, 16128] + - [37, 20857.0] + - - [8448, 16896, 1, 384, 8448, 8448, 8448, 16896] + - [37, 20857.0] + - - [8832, 17664, 1, 384, 8832, 8832, 8832, 17664] + - [21, 20876.0] + - - [9216, 18432, 1, 384, 9216, 9216, 9216, 18432] + - [21, 20847.0] + - - [9600, 19200, 1, 384, 9600, 9600, 9600, 19200] + - [21, 20893.0] + - - [9984, 19968, 1, 384, 9984, 9984, 9984, 19968] + - [37, 20882.0] + - - [10368, 20736, 1, 384, 10368, 10368, 10368, 20736] + - [37, 20910.0] + - - [10752, 21504, 1, 384, 10752, 10752, 10752, 21504] + - [37, 20892.0] + - - [11136, 22272, 1, 384, 11136, 11136, 11136, 22272] + - [37, 20918.0] + - - [11520, 23040, 1, 384, 11520, 11520, 11520, 23040] + - [37, 20905.0] + - - [11904, 23808, 1, 384, 11904, 11904, 11904, 23808] + - [21, 20931.0] + - - [12288, 24576, 1, 384, 12288, 12288, 12288, 24576] + - [21, 20908.0] + - - [12672, 25344, 1, 384, 12672, 12672, 12672, 25344] + - [21, 20933.0] + - - [13056, 26112, 1, 384, 13056, 13056, 13056, 26112] + - [21, 20914.0] + - - [13440, 26880, 1, 384, 13440, 13440, 13440, 26880] + - [21, 20938.0] + - - [13824, 27648, 1, 384, 13824, 13824, 13824, 27648] + - [37, 20928.0] + - - [14208, 28416, 1, 384, 14208, 14208, 14208, 28416] + - [4, 20976.0] + - - [14592, 29184, 1, 384, 14592, 14592, 14592, 29184] + - [37, 20930.0] + - - [14976, 29952, 1, 384, 14976, 14976, 14976, 29952] + - [21, 20948.0] + - - [15360, 30720, 1, 384, 15360, 15360, 15360, 30720] + - [21, 20927.0] + - - [15744, 31488, 1, 384, 15744, 15744, 15744, 31488] + - [21, 20950.0] + - - [16128, 32256, 1, 384, 16128, 16128, 16128, 32256] + - [37, 20940.0] + - - [16512, 33024, 1, 384, 16512, 16512, 16512, 33024] + - [21, 20957.0] + - - [16896, 33792, 1, 384, 16896, 16896, 16896, 33792] + - [37, 20944.0] + - - [17280, 34560, 1, 384, 17280, 17280, 17280, 34560] + - [37, 20960.0] + - - [17664, 35328, 1, 384, 17664, 17664, 17664, 35328] + - [37, 20966.0] + - - [18048, 36096, 1, 384, 18048, 18048, 18048, 36096] + - [37, 20957.0] + - - [18432, 36864, 1, 384, 18432, 18432, 18432, 36864] + - [21, 20935.0] + - - [18816, 37632, 1, 384, 18816, 18816, 18816, 37632] + - [37, 20975.0] + - - [19200, 38400, 1, 384, 19200, 19200, 19200, 38400] + - [37, 20950.0] + - - [19584, 39168, 1, 384, 19584, 19584, 19584, 39168] + - [21, 20973.0] + - - [19968, 39936, 1, 384, 19968, 19968, 19968, 39936] + - [4, 20963.0] + - - [20352, 40704, 1, 384, 20352, 20352, 20352, 40704] + - [37, 20978.0] + - - [20736, 41472, 1, 384, 20736, 20736, 20736, 41472] + - [37, 20976.0] + - - [21120, 42240, 1, 384, 21120, 21120, 21120, 42240] + - [37, 20980.0] + - - [21504, 43008, 1, 384, 21504, 21504, 21504, 43008] + - [4, 20970.0] + - - [21888, 43776, 1, 384, 21888, 21888, 21888, 43776] + - [37, 20973.0] + - - [22272, 44544, 1, 384, 22272, 22272, 22272, 44544] + - [21, 20973.0] + - - [22656, 45312, 1, 384, 22656, 22656, 22656, 45312] + - [37, 20990.0] + - - [23040, 46080, 1, 384, 23040, 23040, 23040, 46080] + - [21, 20979.0] + - - [1152, 1536, 1, 384, 1152, 1152, 1152, 1536] + - [38, 18062.0] + - - [1920, 1536, 1, 384, 1920, 1920, 1920, 1536] + - [5, 19319.0] + - - [2304, 1536, 1, 384, 2304, 2304, 2304, 1536] + - [38, 19469.0] + - - [2688, 1536, 1, 384, 2688, 2688, 2688, 1536] + - [5, 19622.0] + - - [3456, 1536, 1, 384, 3456, 3456, 3456, 1536] + - [5, 19825.0] + - - [3840, 1536, 1, 384, 3840, 3840, 3840, 1536] + - [22, 19826.0] + - - [4224, 1536, 1, 384, 4224, 4224, 4224, 1536] + - [38, 19915.0] + - - [4608, 1536, 1, 384, 4608, 4608, 4608, 1536] + - [4, 19970.0] + - - [4992, 1536, 1, 384, 4992, 4992, 4992, 1536] + - [5, 19976.0] + - - [5376, 1536, 1, 384, 5376, 5376, 5376, 1536] + - [4, 20061.0] + - - [5760, 1536, 1, 384, 5760, 5760, 5760, 1536] + - [5, 20039.0] + - - [6144, 1536, 1, 384, 6144, 6144, 6144, 1536] + - [1, 20135.0] + - - [6528, 1536, 1, 384, 6528, 6528, 6528, 1536] + - [38, 20098.0] + - - [6912, 1536, 1, 384, 6912, 6912, 6912, 1536] + - [4, 20239.0] + - - [7296, 1536, 1, 384, 7296, 7296, 7296, 1536] + - [38, 20132.0] + - - [7680, 1536, 1, 384, 7680, 7680, 7680, 1536] + - [4, 20299.0] + - - [8064, 1536, 1, 384, 8064, 8064, 8064, 1536] + - [38, 20171.0] + - - [8448, 1536, 1, 384, 8448, 8448, 8448, 1536] + - [4, 20340.0] + - - [8832, 1536, 1, 384, 8832, 8832, 8832, 1536] + - [38, 20179.0] + - - [9216, 1536, 1, 384, 9216, 9216, 9216, 1536] + - [4, 20380.0] + - - [9600, 1536, 1, 384, 9600, 9600, 9600, 1536] + - [38, 20207.0] + - - [9984, 1536, 1, 384, 9984, 9984, 9984, 1536] + - [4, 20403.0] + - - [10368, 1536, 1, 384, 10368, 10368, 10368, 1536] + - [5, 20221.0] + - - [10752, 1536, 1, 384, 10752, 10752, 10752, 1536] + - [4, 20471.0] + - - [11136, 1536, 1, 384, 11136, 11136, 11136, 1536] + - [38, 20240.0] + - - [11520, 1536, 1, 384, 11520, 11520, 11520, 1536] + - [37, 20486.0] + - - [11904, 1536, 1, 384, 11904, 11904, 11904, 1536] + - [38, 20256.0] + - - [12288, 1536, 1, 384, 12288, 12288, 12288, 1536] + - [4, 20562.0] + - - [12672, 1536, 1, 384, 12672, 12672, 12672, 1536] + - [22, 20266.0] + - - [13056, 1536, 1, 384, 13056, 13056, 13056, 1536] + - [4, 20536.0] + - - [13440, 1536, 1, 384, 13440, 13440, 13440, 1536] + - [22, 20279.0] + - - [13824, 1536, 1, 384, 13824, 13824, 13824, 1536] + - [46, 20569.0] + - - [14208, 1536, 1, 384, 14208, 14208, 14208, 1536] + - [38, 20287.0] + - - [14592, 1536, 1, 384, 14592, 14592, 14592, 1536] + - [4, 20569.0] + - - [14976, 1536, 1, 384, 14976, 14976, 14976, 1536] + - [22, 20296.0] + - - [15360, 1536, 1, 384, 15360, 15360, 15360, 1536] + - [4, 20572.0] + - - [15744, 1536, 1, 384, 15744, 15744, 15744, 1536] + - [38, 20299.0] + - - [16128, 1536, 1, 384, 16128, 16128, 16128, 1536] + - [4, 20597.0] + - - [16512, 1536, 1, 384, 16512, 16512, 16512, 1536] + - [38, 20321.0] + - - [16896, 1536, 1, 384, 16896, 16896, 16896, 1536] + - [37, 20618.0] + - - [17280, 1536, 1, 384, 17280, 17280, 17280, 1536] + - [38, 20293.0] + - - [17664, 1536, 1, 384, 17664, 17664, 17664, 1536] + - [46, 20598.0] + - - [18048, 1536, 1, 384, 18048, 18048, 18048, 1536] + - [22, 20280.0] + - - [18432, 1536, 1, 384, 18432, 18432, 18432, 1536] + - [4, 20551.0] + - - [18816, 1536, 1, 384, 18816, 18816, 18816, 1536] + - [38, 20288.0] + - - [19200, 1536, 1, 384, 19200, 19200, 19200, 1536] + - [4, 20564.0] + - - [19584, 1536, 1, 384, 19584, 19584, 19584, 1536] + - [22, 20292.0] + - - [19968, 1536, 1, 384, 19968, 19968, 19968, 1536] + - [4, 20597.0] + - - [20352, 1536, 1, 384, 20352, 20352, 20352, 1536] + - [34, 20325.0] + - - [20736, 1536, 1, 384, 20736, 20736, 20736, 1536] + - [37, 20585.0] + - - [21120, 1536, 1, 384, 21120, 21120, 21120, 1536] + - [37, 20327.0] + - - [21504, 1536, 1, 384, 21504, 21504, 21504, 1536] + - [4, 20556.0] + - - [21888, 1536, 1, 384, 21888, 21888, 21888, 1536] + - [37, 20357.0] + - - [22272, 1536, 1, 384, 22272, 22272, 22272, 1536] + - [37, 20610.0] + - - [22656, 1536, 1, 384, 22656, 22656, 22656, 1536] + - [34, 20364.0] + - - [23040, 1536, 1, 384, 23040, 23040, 23040, 1536] + - [37, 20598.0] + - - [768, 1920, 1, 384, 768, 768, 768, 1920] + - [38, 15156.0] + - - [1152, 1920, 1, 384, 1152, 1152, 1152, 1920] + - [38, 17979.0] + - - [1536, 1920, 1, 384, 1536, 1536, 1536, 1920] + - [22, 19168.0] + - - [2304, 1920, 1, 384, 2304, 2304, 2304, 1920] + - [38, 18549.0] + - - [2688, 1920, 1, 384, 2688, 2688, 2688, 1920] + - [5, 19297.0] + - - [3072, 1920, 1, 384, 3072, 3072, 3072, 1920] + - [1, 19816.0] + - - [3456, 1920, 1, 384, 3456, 3456, 3456, 1920] + - [17, 19058.0] + - - [4224, 1920, 1, 384, 4224, 4224, 4224, 1920] + - [34, 19831.0] + - - [4608, 1920, 1, 384, 4608, 4608, 4608, 1920] + - [38, 20077.0] + - - [4992, 1920, 1, 384, 4992, 4992, 4992, 1920] + - [37, 19471.0] + - - [5376, 1920, 1, 384, 5376, 5376, 5376, 1920] + - [37, 19761.0] + - - [5760, 1920, 1, 384, 5760, 5760, 5760, 1920] + - [37, 20073.0] + - - [6144, 1920, 1, 384, 6144, 6144, 6144, 1920] + - [4, 20313.0] + - - [6528, 1920, 1, 384, 6528, 6528, 6528, 1920] + - [21, 19763.0] + - - [6912, 1920, 1, 384, 6912, 6912, 6912, 1920] + - [4, 19986.0] + - - [7296, 1920, 1, 384, 7296, 7296, 7296, 1920] + - [34, 20241.0] + - - [7680, 1920, 1, 384, 7680, 7680, 7680, 1920] + - [30, 20233.0] + - - [8064, 1920, 1, 384, 8064, 8064, 8064, 1920] + - [18, 19966.0] + - - [8448, 1920, 1, 384, 8448, 8448, 8448, 1920] + - [4, 20144.0] + - - [8832, 1920, 1, 384, 8832, 8832, 8832, 1920] + - [4, 20376.0] + - - [9216, 1920, 1, 384, 9216, 9216, 9216, 1920] + - [4, 20501.0] + - - [9600, 1920, 1, 384, 9600, 9600, 9600, 1920] + - [37, 20136.0] + - - [9984, 1920, 1, 384, 9984, 9984, 9984, 1920] + - [37, 20255.0] + - - [10368, 1920, 1, 384, 10368, 10368, 10368, 1920] + - [21, 20438.0] + - - [10752, 1920, 1, 384, 10752, 10752, 10752, 1920] + - [22, 20273.0] + - - [11136, 1920, 1, 384, 11136, 11136, 11136, 1920] + - [37, 20227.0] + - - [11520, 1920, 1, 384, 11520, 11520, 11520, 1920] + - [37, 20351.0] + - - [11904, 1920, 1, 384, 11904, 11904, 11904, 1920] + - [21, 20507.0] + - - [12288, 1920, 1, 384, 12288, 12288, 12288, 1920] + - [4, 20590.0] + - - [12672, 1920, 1, 384, 12672, 12672, 12672, 1920] + - [37, 20319.0] + - - [13056, 1920, 1, 384, 13056, 13056, 13056, 1920] + - [37, 20417.0] + - - [13440, 1920, 1, 384, 13440, 13440, 13440, 1920] + - [37, 20560.0] + - - [13824, 1920, 1, 384, 13824, 13824, 13824, 1920] + - [47, 20318.0] + - - [14208, 1920, 1, 384, 14208, 14208, 14208, 1920] + - [46, 20345.0] + - - [14592, 1920, 1, 384, 14592, 14592, 14592, 1920] + - [34, 20386.0] + - - [14976, 1920, 1, 384, 14976, 14976, 14976, 1920] + - [21, 20518.0] + - - [15360, 1920, 1, 384, 15360, 15360, 15360, 1920] + - [4, 20565.0] + - - [15744, 1920, 1, 384, 15744, 15744, 15744, 1920] + - [34, 20374.0] + - - [16128, 1920, 1, 384, 16128, 16128, 16128, 1920] + - [37, 20444.0] + - - [16512, 1920, 1, 384, 16512, 16512, 16512, 1920] + - [21, 20551.0] + - - [16896, 1920, 1, 384, 16896, 16896, 16896, 1920] + - [22, 20331.0] + - - [17280, 1920, 1, 384, 17280, 17280, 17280, 1920] + - [21, 20425.0] + - - [17664, 1920, 1, 384, 17664, 17664, 17664, 1920] + - [37, 20487.0] + - - [18048, 1920, 1, 384, 18048, 18048, 18048, 1920] + - [37, 20601.0] + - - [18432, 1920, 1, 384, 18432, 18432, 18432, 1920] + - [4, 20605.0] + - - [18816, 1920, 1, 384, 18816, 18816, 18816, 1920] + - [21, 20462.0] + - - [19200, 1920, 1, 384, 19200, 19200, 19200, 1920] + - [37, 20515.0] + - - [19584, 1920, 1, 384, 19584, 19584, 19584, 1920] + - [21, 20609.0] + - - [19968, 1920, 1, 384, 19968, 19968, 19968, 1920] + - [21, 20417.0] + - - [20352, 1920, 1, 384, 20352, 20352, 20352, 1920] + - [37, 20498.0] + - - [20736, 1920, 1, 384, 20736, 20736, 20736, 1920] + - [37, 20564.0] + - - [21120, 1920, 1, 384, 21120, 21120, 21120, 1920] + - [21, 20650.0] + - - [21504, 1920, 1, 384, 21504, 21504, 21504, 1920] + - [4, 20611.0] + - - [21888, 1920, 1, 384, 21888, 21888, 21888, 1920] + - [21, 20538.0] + - - [22272, 1920, 1, 384, 22272, 22272, 22272, 1920] + - [46, 20553.0] + - - [22656, 1920, 1, 384, 22656, 22656, 22656, 1920] + - [37, 20665.0] + - - [23040, 1920, 1, 384, 23040, 23040, 23040, 1920] + - [37, 20483.0] + - - [768, 2304, 1, 384, 768, 768, 768, 2304] + - [35, 17881.0] + - - [1536, 2304, 1, 384, 1536, 1536, 1536, 2304] + - [22, 19391.0] + - - [1920, 2304, 1, 384, 1920, 1920, 1920, 2304] + - [38, 18593.0] + - - [2688, 2304, 1, 384, 2688, 2688, 2688, 2304] + - [34, 19136.0] + - - [3072, 2304, 1, 384, 3072, 3072, 3072, 2304] + - [1, 19985.0] + - - [3456, 2304, 1, 384, 3456, 3456, 3456, 2304] + - [34, 19498.0] + - - [3840, 2304, 1, 384, 3840, 3840, 3840, 2304] + - [5, 19987.0] + - - [4224, 2304, 1, 384, 4224, 4224, 4224, 2304] + - [21, 19738.0] + - - [4992, 2304, 1, 384, 4992, 4992, 4992, 2304] + - [4, 19884.0] + - - [5376, 2304, 1, 384, 5376, 5376, 5376, 2304] + - [22, 20108.0] + - - [5760, 2304, 1, 384, 5760, 5760, 5760, 2304] + - [37, 20016.0] + - - [6144, 2304, 1, 384, 6144, 6144, 6144, 2304] + - [4, 20391.0] + - - [6528, 2304, 1, 384, 6528, 6528, 6528, 2304] + - [37, 20158.0] + - - [6912, 2304, 1, 384, 6912, 6912, 6912, 2304] + - [47, 20198.0] + - - [7296, 2304, 1, 384, 7296, 7296, 7296, 2304] + - [21, 20225.0] + - - [7680, 2304, 1, 384, 7680, 7680, 7680, 2304] + - [37, 20491.0] + - - [8064, 2304, 1, 384, 8064, 8064, 8064, 2304] + - [21, 20266.0] + - - [8448, 2304, 1, 384, 8448, 8448, 8448, 2304] + - [22, 20270.0] + - - [8832, 2304, 1, 384, 8832, 8832, 8832, 2304] + - [37, 20329.0] + - - [9216, 2304, 1, 384, 9216, 9216, 9216, 2304] + - [4, 20546.0] + - - [9600, 2304, 1, 384, 9600, 9600, 9600, 2304] + - [37, 20374.0] + - - [9984, 2304, 1, 384, 9984, 9984, 9984, 2304] + - [38, 20281.0] + - - [10368, 2304, 1, 384, 10368, 10368, 10368, 2304] + - [37, 20430.0] + - - [10752, 2304, 1, 384, 10752, 10752, 10752, 2304] + - [37, 20614.0] + - - [11136, 2304, 1, 384, 11136, 11136, 11136, 2304] + - [34, 20439.0] + - - [11520, 2304, 1, 384, 11520, 11520, 11520, 2304] + - [47, 20280.0] + - - [11904, 2304, 1, 384, 11904, 11904, 11904, 2304] + - [46, 20423.0] + - - [12288, 2304, 1, 384, 12288, 12288, 12288, 2304] + - [4, 20531.0] + - - [12672, 2304, 1, 384, 12672, 12672, 12672, 2304] + - [37, 20435.0] + - - [13056, 2304, 1, 384, 13056, 13056, 13056, 2304] + - [22, 20298.0] + - - [13440, 2304, 1, 384, 13440, 13440, 13440, 2304] + - [37, 20470.0] + - - [13824, 2304, 1, 384, 13824, 13824, 13824, 2304] + - [37, 20589.0] + - - [14208, 2304, 1, 384, 14208, 14208, 14208, 2304] + - [18, 20508.0] + - - [14592, 2304, 1, 384, 14592, 14592, 14592, 2304] + - [37, 20340.0] + - - [14976, 2304, 1, 384, 14976, 14976, 14976, 2304] + - [37, 20514.0] + - - [15360, 2304, 1, 384, 15360, 15360, 15360, 2304] + - [4, 20581.0] + - - [15744, 2304, 1, 384, 15744, 15744, 15744, 2304] + - [21, 20527.0] + - - [16128, 2304, 1, 384, 16128, 16128, 16128, 2304] + - [37, 20401.0] + - - [16512, 2304, 1, 384, 16512, 16512, 16512, 2304] + - [21, 20562.0] + - - [16896, 2304, 1, 384, 16896, 16896, 16896, 2304] + - [37, 20651.0] + - - [17280, 2304, 1, 384, 17280, 17280, 17280, 2304] + - [37, 20574.0] + - - [17664, 2304, 1, 384, 17664, 17664, 17664, 2304] + - [37, 20440.0] + - - [18048, 2304, 1, 384, 18048, 18048, 18048, 2304] + - [37, 20588.0] + - - [18432, 2304, 1, 384, 18432, 18432, 18432, 2304] + - [4, 20594.0] + - - [18816, 2304, 1, 384, 18816, 18816, 18816, 2304] + - [21, 20608.0] + - - [19200, 2304, 1, 384, 19200, 19200, 19200, 2304] + - [37, 20487.0] + - - [19584, 2304, 1, 384, 19584, 19584, 19584, 2304] + - [37, 20626.0] + - - [19968, 2304, 1, 384, 19968, 19968, 19968, 2304] + - [37, 20664.0] + - - [20352, 2304, 1, 384, 20352, 20352, 20352, 2304] + - [21, 20633.0] + - - [20736, 2304, 1, 384, 20736, 20736, 20736, 2304] + - [37, 20526.0] + - - [21120, 2304, 1, 384, 21120, 21120, 21120, 2304] + - [21, 20640.0] + - - [21504, 2304, 1, 384, 21504, 21504, 21504, 2304] + - [4, 20666.0] + - - [21888, 2304, 1, 384, 21888, 21888, 21888, 2304] + - [37, 20658.0] + - - [22272, 2304, 1, 384, 22272, 22272, 22272, 2304] + - [4, 20541.0] + - - [22656, 2304, 1, 384, 22656, 22656, 22656, 2304] + - [37, 20659.0] + - - [23040, 2304, 1, 384, 23040, 23040, 23040, 2304] + - [37, 20707.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [9, 543.0] + - - [289, 128, 64, 768, 289, 289, 289, 128] + - [17, 13878.0] + - - [289, 160, 64, 768, 289, 289, 289, 160] + - [17, 12163.0] + - - [289, 192, 64, 768, 289, 289, 289, 192] + - [17, 14561.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [33, 18486.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [9, 16695.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [17, 16034.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [33, 14504.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [0, 17640.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [33, 17318.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [17, 17144.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [0, 18635.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [5, 18336.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [5, 18085.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [0, 18341.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [19, 16933.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [33, 16924.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [21, 21041.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [37, 20592.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [34, 19118.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [4, 21037.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [37, 20588.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [22, 19147.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [21, 21062.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [37, 20468.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [17, 18540.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [33, 18336.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [0, 18604.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [33, 18227.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [33, 18509.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [1, 19524.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [17, 18485.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [1, 20382.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [33, 18613.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [1, 19981.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [33, 18616.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [21, 20431.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [17, 18612.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [21, 20401.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [33, 18615.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [33, 18564.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [1, 20144.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [17, 18556.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [4, 20196.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [33, 18449.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [38, 19348.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [33, 18438.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [42, 18501.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [42, 19418.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [33, 18477.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [33, 18617.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [21, 20887.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [4, 20873.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [3, 18651.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [36, 18603.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [36, 18611.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [33, 18620.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [37, 20848.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [4, 20838.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [37, 20843.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [36, 18623.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [36, 18604.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [20, 18584.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [36, 18597.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [36, 18597.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [20, 18602.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [9, 355.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [9, 413.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [9, 949.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [4, 20850.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [18, 19984.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [4, 20360.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [21, 20335.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [21, 20670.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [17, 18837.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [4, 20598.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [37, 20632.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [33, 18843.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [36, 19201.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [0, 19062.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [3, 1925.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [37, 20447.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [38, 18028.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [38, 19919.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [37, 20523.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [21, 20950.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [37, 20661.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [21, 20666.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [23, 19970.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [23, 20574.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [4, 20949.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [21, 20704.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [21, 20699.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [36, 18869.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [13, 18627.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [21, 20929.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [28, 18617.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [4, 20883.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [4, 20905.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [37, 20937.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [37, 20626.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [9, 336.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [21, 20396.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [37, 20519.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [20, 18610.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [37, 20541.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [18, 20532.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [45, 18607.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [18, 19673.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [33, 18620.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [25, 19976.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [42, 18594.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [0, 17848.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [18, 17626.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [0, 17521.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [21, 19850.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [3, 14717.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [1, 17574.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [18, 17943.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [34, 15167.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [37, 15466.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [33, 18591.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [33, 16957.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [0, 19012.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [37, 19399.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [33, 14100.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [5, 17380.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [10, 17750.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [33, 14654.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [4, 14736.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [33, 18642.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [28, 18641.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [3, 18850.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [3, 18853.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [20, 18645.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [3, 18861.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [20, 18644.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [3, 18842.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [45, 18638.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [28, 18861.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [23, 17874.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [23, 17867.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [23, 17862.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [36, 18885.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [21, 20381.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [21, 20525.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [17, 9022.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [21, 20625.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [21, 20382.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [9, 10510.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [37, 20614.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [37, 20186.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [1, 15626.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [1, 15714.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [0, 16319.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [36, 15447.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [4, 17797.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [1, 17934.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [21, 20574.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [20, 18646.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [37, 20583.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [45, 18629.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [36, 12300.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [36, 12353.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [45, 12443.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [36, 12452.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [21, 20267.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [21, 20000.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [37, 19867.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [37, 20018.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [4, 20700.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [4, 20502.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [4, 20427.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [37, 20629.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [38, 19781.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [38, 19869.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [22, 19960.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [5, 19956.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [33, 18650.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [1, 19697.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [21, 19591.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [21, 19764.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [37, 19722.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [4, 19619.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [37, 19814.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [18, 19420.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [21, 19478.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [21, 19892.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [37, 19755.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [17, 19354.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [37, 19402.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [1, 17716.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [21, 19708.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [37, 20281.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [33, 16907.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [37, 20142.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [11, 18875.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [33, 19040.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [38, 18268.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [30, 18907.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [37, 20355.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [33, 18315.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [30, 17914.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [1, 18326.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [38, 18742.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [0, 15953.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [38, 19618.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [22, 19521.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [38, 17402.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [38, 18189.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [5, 19784.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [15, 19617.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [33, 17357.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [33, 18504.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [0, 19123.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [17, 17219.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [22, 19472.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [5, 19264.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [17, 16483.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [37, 20427.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [33, 16930.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [14, 20108.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [5, 18783.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [42, 19169.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [33, 18240.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [17, 17621.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [33, 17629.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [33, 18589.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [33, 16080.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [17, 15761.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [9, 16718.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [22, 19200.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [17, 17298.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [10, 19005.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [1, 18976.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 256] + - [38, 16895.0] + - - [888, 2048, 2, 512, 888, 888, 888, 2048] + - [17, 18158.0] + - - [12880, 512, 2, 256, 12880, 12880, 12880, 512] + - [33, 19374.0] + - - [12288, 512, 2, 256, 12288, 12288, 12288, 512] + - [1, 19603.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 128] + - [38, 19102.0] + - - [864, 2048, 2, 256, 864, 864, 864, 2048] + - [17, 17276.0] + - - [12672, 128, 2, 512, 12672, 12672, 12672, 128] + - [38, 18069.0] + - - [11264, 128, 2, 512, 11264, 11264, 11264, 128] + - [38, 18933.0] + - - [11776, 128, 2, 512, 11776, 11776, 11776, 128] + - [33, 17786.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 128] + - [5, 19668.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 1024] + - [0, 18707.0] + - - [14000, 128, 2, 512, 14000, 14000, 14000, 128] + - [33, 17678.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 512] + - [33, 19286.0] + - - [805, 2048, 2, 256, 805, 805, 805, 2048] + - [0, 16048.0] + - - [768, 2048, 2, 256, 768, 768, 768, 2048] + - [0, 18162.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 1024] + - [1, 18369.0] + - - [1251, 256, 49, 256, 1251, 1251, 1251, 256] + - [0, 19021.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 256] + - [38, 17593.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 1024] + - [0, 18263.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 1024] + - [0, 18575.0] + - - [15200, 256, 2, 12, 15200, 15200, 15200, 256] + - [22, 5392.0] + - - [12880, 256, 2, 12, 12880, 12880, 12880, 256] + - [5, 5276.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 1024] + - [5, 18826.0] + - - [13600, 256, 2, 12, 13600, 13600, 13600, 256] + - [47, 5343.0] + - - [15200, 256, 2, 3, 15200, 15200, 15200, 256] + - [35, 1497.0] + - - [12880, 256, 2, 3, 12880, 12880, 12880, 256] + - [19, 1476.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 1024] + - [15, 19175.0] + - - [12288, 256, 2, 12, 12288, 12288, 12288, 256] + - [13, 5347.0] + - - [13824, 256, 2, 12, 13824, 13824, 13824, 256] + - [3, 5438.0] + - - [13600, 256, 2, 3, 13600, 13600, 13600, 256] + - [5, 1505.0] + - - [1900, 1024, 1, 2048, 1900, 1900, 1900, 1024] + - [33, 17939.0] + - - [7600, 512, 1, 256, 7600, 7600, 7600, 512] + - [2, 17968.0] + - - [1610, 1024, 1, 2048, 1610, 1610, 1610, 1024] + - [15, 18056.0] + - - [6144, 512, 1, 256, 6144, 6144, 6144, 512] + - [0, 18089.0] + - - [1900, 1024, 1, 512, 1900, 1900, 1900, 1024] + - [0, 17336.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [37, 20517.0] + - - [3220, 256, 2, 12, 3220, 3220, 3220, 256] + - [5, 3691.0] + - - [3220, 256, 2, 3, 3220, 3220, 3220, 256] + - [19, 1066.0] + - - [3800, 256, 2, 3, 3800, 3800, 3800, 256] + - [33, 1089.0] + - - [13824, 256, 2, 3, 13824, 13824, 13824, 256] + - [3, 1506.0] + - - [12288, 256, 2, 3, 12288, 12288, 12288, 256] + - [0, 1547.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 256] + - [17, 17174.0] + - - [3072, 256, 2, 12, 3072, 3072, 3072, 256] + - [0, 3630.0] + - - [3800, 256, 2, 12, 3800, 3800, 3800, 256] + - [38, 3865.0] + - - [3072, 256, 2, 3, 3072, 3072, 3072, 256] + - [0, 1102.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 256] + - [33, 15933.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 512] + - [47, 19731.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 256] + - [17, 15234.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 256] + - [17, 14929.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 1024] + - [5, 18731.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 256] + - [38, 16579.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 1024] + - [5, 18648.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 256] + - [0, 17961.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 256] + - [5, 16980.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 1024] + - [17, 18450.0] + - - [3456, 256, 2, 3, 3456, 3456, 3456, 256] + - [35, 1129.0] + - - [3400, 256, 2, 3, 3400, 3400, 3400, 256] + - [19, 1079.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 1024] + - [38, 19249.0] + - - [3456, 256, 2, 12, 3456, 3456, 3456, 256] + - [5, 4006.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 1024] + - [33, 18796.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 256] + - [19, 17418.0] + - - [850, 2048, 2, 256, 850, 850, 850, 2048] + - [0, 16983.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 256] + - [5, 16763.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 1024] + - [0, 18142.0] + - - [51520, 256, 2, 12, 51520, 51520, 51520, 256] + - [10, 7483.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 256] + - [38, 16993.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 1024] + - [9, 18338.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 1024] + - [17, 18881.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 256] + - [5, 16054.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 256] + - [5, 17882.0] + - - [54400, 256, 2, 12, 54400, 54400, 54400, 256] + - [19, 7712.0] + - - [950, 2048, 2, 256, 950, 950, 950, 2048] + - [33, 16602.0] + - - [55296, 256, 2, 3, 55296, 55296, 55296, 256] + - [14, 1972.0] + - - [60800, 256, 2, 12, 60800, 60800, 60800, 256] + - [43, 8391.0] + - - [51520, 256, 2, 3, 51520, 51520, 51520, 256] + - [15, 1755.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 1024] + - [5, 19048.0] + - - [55296, 256, 2, 12, 55296, 55296, 55296, 256] + - [13, 6814.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 256] + - [22, 15680.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 1024] + - [5, 18837.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 256] + - [33, 17161.0] + - - [60800, 256, 2, 3, 60800, 60800, 60800, 256] + - [35, 2308.0] + - - [1269, 256, 49, 256, 1269, 1269, 1269, 256] + - [33, 19226.0] + - - [1467, 256, 49, 256, 1467, 1467, 1467, 256] + - [1, 18736.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 256] + - [0, 16185.0] + - - [952, 256, 64, 256, 952, 952, 952, 256] + - [38, 18231.0] + - - [49152, 256, 2, 12, 49152, 49152, 49152, 256] + - [4, 5952.0] + - - [1449, 256, 49, 256, 1449, 1449, 1449, 256] + - [38, 18525.0] + - - [1278, 256, 49, 256, 1278, 1278, 1278, 256] + - [17, 19331.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 256] + - [19, 18285.0] + - - [736, 256, 64, 256, 736, 736, 736, 256] + - [1, 18626.0] + - - [1413, 256, 49, 256, 1413, 1413, 1413, 256] + - [38, 18166.0] + - - [600, 256, 64, 256, 600, 600, 600, 256] + - [5, 18255.0] + - - [1341, 256, 49, 256, 1341, 1341, 1341, 256] + - [5, 18884.0] + - - [1287, 256, 49, 256, 1287, 1287, 1287, 256] + - [38, 18207.0] + - - [1332, 256, 49, 256, 1332, 1332, 1332, 256] + - [38, 18888.0] + - - [1359, 256, 49, 256, 1359, 1359, 1359, 256] + - [5, 19144.0] + - - [1440, 256, 49, 256, 1440, 1440, 1440, 256] + - [22, 18603.0] + - - [1395, 256, 49, 256, 1395, 1395, 1395, 256] + - [47, 19337.0] + - - [1323, 256, 49, 256, 1323, 1323, 1323, 256] + - [22, 18765.0] + - - [1404, 256, 49, 256, 1404, 1404, 1404, 256] + - [38, 19759.0] + - - [1386, 256, 49, 256, 1386, 1386, 1386, 256] + - [22, 19475.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 1024] + - [17, 18353.0] + - - [1350, 256, 49, 256, 1350, 1350, 1350, 256] + - [47, 18806.0] + - - [1368, 256, 49, 256, 1368, 1368, 1368, 256] + - [38, 19271.0] + - - [49152, 256, 2, 3, 49152, 49152, 49152, 256] + - [9, 1497.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 256] + - [33, 16685.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 1024] + - [0, 18748.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 1024] + - [5, 18946.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 256] + - [5, 17670.0] + - - [690, 256, 64, 256, 690, 690, 690, 256] + - [38, 17396.0] + - - [54400, 256, 2, 3, 54400, 54400, 54400, 256] + - [11, 1960.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 1024] + - [22, 19015.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 256] + - [0, 16470.0] + - - [616, 256, 64, 256, 616, 616, 616, 256] + - [38, 18823.0] + - - [3008, 256, 64, 256, 3008, 3008, 3008, 256] + - [1, 19714.0] + - - [896, 256, 64, 256, 896, 896, 896, 256] + - [18, 19935.0] + - - [768, 256, 64, 256, 768, 768, 768, 256] + - [1, 19454.0] + - - [660, 256, 64, 256, 660, 660, 660, 256] + - [0, 16658.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 256] + - [3, 16542.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 1024] + - [9, 19007.0] + - - [800, 256, 64, 256, 800, 800, 800, 256] + - [34, 17884.0] + - - [1120, 256, 49, 256, 1120, 1120, 1120, 256] + - [18, 19219.0] + - - [2408, 256, 64, 256, 2408, 2408, 2408, 256] + - [21, 19559.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 256] + - [5, 19069.0] + - - [672, 256, 64, 256, 672, 672, 672, 256] + - [17, 17008.0] + - - [782, 256, 64, 256, 782, 782, 782, 256] + - [30, 17267.0] + - - [884, 256, 64, 256, 884, 884, 884, 256] + - [22, 19398.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 1024] + - [43, 19580.0] + - - [1064, 256, 49, 256, 1064, 1064, 1064, 256] + - [1, 18138.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 256] + - [19, 18569.0] + - - [704, 256, 64, 256, 704, 704, 704, 256] + - [0, 17805.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 1024] + - [5, 19250.0] + - - [3264, 256, 64, 256, 3264, 3264, 3264, 256] + - [37, 19926.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 1024] + - [17, 18665.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 256] + - [17, 17628.0] + - - [6440, 512, 1, 256, 6440, 6440, 6440, 512] + - [5, 17542.0] + - - [6912, 512, 1, 256, 6912, 6912, 6912, 512] + - [38, 19097.0] + - - [6800, 512, 1, 256, 6800, 6800, 6800, 512] + - [5, 18257.0] + - - [6800, 512, 1, 1024, 6800, 6800, 6800, 512] + - [22, 19391.0] + - - [6440, 512, 1, 1024, 6440, 6440, 6440, 512] + - [22, 18398.0] + - - [6912, 512, 1, 1024, 6912, 6912, 6912, 512] + - [38, 19898.0] + - - [1728, 1024, 1, 512, 1728, 1728, 1728, 1024] + - [33, 15811.0] + - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] + - [38, 16918.0] + - - [1610, 1024, 1, 512, 1610, 1610, 1610, 1024] + - [5, 17157.0] + - - [7600, 512, 1, 1024, 7600, 7600, 7600, 512] + - [5, 18904.0] + - - [6144, 512, 1, 1024, 6144, 6144, 6144, 512] + - [33, 18704.0] + - - [1700, 1024, 1, 512, 1700, 1700, 1700, 1024] + - [33, 15539.0] + - - [1728, 1024, 1, 2048, 1728, 1728, 1728, 1024] + - [33, 16347.0] + - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] + - [38, 17482.0] + - - [1700, 1024, 1, 2048, 1700, 1700, 1700, 1024] + - [33, 16077.0] + - - [1920, 25216, 1, 16384, 1920, 1920, 1920, 25216] + - [21, 20862.0] + - - [3840, 1920, 1, 16384, 3840, 3840, 3840, 1920] + - [4, 19894.0] + - - [1920, 3840, 1, 16384, 1920, 1920, 1920, 3840] + - [29, 19868.0] + - - [960, 1920, 1, 16384, 960, 960, 960, 1920] + - [33, 17113.0] + - - [1920, 2880, 1, 16384, 1920, 1920, 1920, 2880] + - [20, 19500.0] + - - [1920, 25216, 1, 4096, 1920, 1920, 1920, 25216] + - [37, 20856.0] + - - [3840, 1920, 1, 4096, 3840, 3840, 3840, 1920] + - [4, 19810.0] + - - [1920, 3840, 1, 4096, 1920, 1920, 1920, 3840] + - [37, 19798.0] + - - [960, 1920, 1, 4096, 960, 960, 960, 1920] + - [33, 17109.0] + - - [1920, 2880, 1, 4096, 1920, 1920, 1920, 2880] + - [0, 19463.0] + - - [1920, 25216, 1, 8192, 1920, 1920, 1920, 25216] + - [37, 20858.0] + - - [3840, 1920, 1, 8192, 3840, 3840, 3840, 1920] + - [4, 19857.0] + - - [1920, 3840, 1, 8192, 1920, 1920, 1920, 3840] + - [37, 19878.0] + - - [960, 1920, 1, 8192, 960, 960, 960, 1920] + - [42, 17159.0] + - - [1920, 2880, 1, 8192, 1920, 1920, 1920, 2880] + - [36, 19475.0] + - - [2304, 12672, 1, 16384, 2304, 2304, 2304, 12672] + - [37, 20789.0] + - - [2304, 2304, 1, 16384, 2304, 2304, 2304, 2304] + - [5, 20356.0] + - - [576, 2304, 1, 16384, 576, 576, 576, 2304] + - [5, 14873.0] + - - [2304, 1728, 1, 16384, 2304, 2304, 2304, 1728] + - [5, 19535.0] + - - [2304, 12672, 1, 4096, 2304, 2304, 2304, 12672] + - [21, 20758.0] + - - [2304, 2304, 1, 4096, 2304, 2304, 2304, 2304] + - [38, 20257.0] + - - [576, 2304, 1, 4096, 576, 576, 576, 2304] + - [38, 14777.0] + - - [2304, 1728, 1, 4096, 2304, 2304, 2304, 1728] + - [38, 19418.0] + - - [2304, 12672, 1, 8192, 2304, 2304, 2304, 12672] + - [21, 20789.0] + - - [2304, 2304, 1, 8192, 2304, 2304, 2304, 2304] + - [22, 20301.0] + - - [576, 2304, 1, 8192, 576, 576, 576, 2304] + - [22, 14816.0] + - - [2304, 1728, 1, 8192, 2304, 2304, 2304, 1728] + - [38, 19480.0] + - - [3072, 6400, 1, 4096, 3072, 3072, 3072, 6400] + - [37, 20497.0] + - - [1536, 3072, 1, 4096, 1536, 1536, 1536, 3072] + - [39, 20318.0] + - - [3072, 1536, 1, 4096, 3072, 3072, 3072, 1536] + - [6, 20314.0] + - - [384, 3072, 1, 4096, 384, 384, 384, 3072] + - [41, 19157.0] + - - [3072, 1152, 1, 4096, 3072, 3072, 3072, 1152] + - [5, 20099.0] + - - [3072, 6400, 1, 8192, 3072, 3072, 3072, 6400] + - [21, 20524.0] + - - [1536, 3072, 1, 8192, 1536, 1536, 1536, 3072] + - [23, 20419.0] + - - [3072, 1536, 1, 8192, 3072, 3072, 3072, 1536] + - [39, 20392.0] + - - [384, 3072, 1, 8192, 384, 384, 384, 3072] + - [41, 19204.0] + - - [3072, 1152, 1, 8192, 3072, 3072, 3072, 1152] + - [22, 20130.0] + - - [2048, 2048, 1, 4096, 2048, 2048, 2048, 2048] + - [0, 18613.0] + - - [2048, 2048, 1, 8, 2048, 2048, 2048, 2048] + - [0, 3631.0] + - - [2048, 29000, 1, 199, 2048, 2048, 2048, 29000] + - [1, 19965.0] + - - [2048, 29000, 1, 221, 2048, 2048, 2048, 29000] + - [21, 20037.0] + - - [2048, 29000, 1, 224, 2048, 2048, 2048, 29000] + - [21, 20131.0] + - - [2048, 29000, 1, 229, 2048, 2048, 2048, 29000] + - [21, 20135.0] + - - [2048, 29000, 1, 234, 2048, 2048, 2048, 29000] + - [21, 20128.0] + - - [2048, 29000, 1, 242, 2048, 2048, 2048, 29000] + - [21, 20187.0] + - - [2048, 29000, 1, 246, 2048, 2048, 2048, 29000] + - [21, 20220.0] + - - [2048, 29000, 1, 247, 2048, 2048, 2048, 29000] + - [21, 20241.0] + - - [2048, 29000, 1, 256, 2048, 2048, 2048, 29000] + - [21, 20362.0] + - - [2048, 29000, 1, 262, 2048, 2048, 2048, 29000] + - [21, 20282.0] + - - [2048, 29000, 1, 264, 2048, 2048, 2048, 29000] + - [21, 20291.0] + - - [2048, 29000, 1, 265, 2048, 2048, 2048, 29000] + - [21, 20273.0] + - - [2048, 29000, 1, 274, 2048, 2048, 2048, 29000] + - [21, 20309.0] + - - [2048, 29000, 1, 277, 2048, 2048, 2048, 29000] + - [21, 20296.0] + - - [2048, 29000, 1, 279, 2048, 2048, 2048, 29000] + - [21, 20340.0] + - - [2048, 29000, 1, 288, 2048, 2048, 2048, 29000] + - [21, 20456.0] + - - [2048, 29000, 1, 296, 2048, 2048, 2048, 29000] + - [21, 20401.0] + - - [2048, 29000, 1, 315, 2048, 2048, 2048, 29000] + - [4, 20402.0] + - - [2048, 29000, 1, 335, 2048, 2048, 2048, 29000] + - [4, 20459.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [37, 19597.0] + - - [4096, 2048, 1, 4096, 4096, 4096, 4096, 2048] + - [4, 19622.0] + - - [1024, 29000, 1, 2283, 1024, 1024, 1024, 29000] + - [21, 20703.0] + - - [1024, 29000, 1, 2296, 1024, 1024, 1024, 29000] + - [37, 20681.0] + - - [1024, 29000, 1, 2306, 1024, 1024, 1024, 29000] + - [21, 20720.0] + - - [1024, 29000, 1, 2309, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 2318, 1024, 1024, 1024, 29000] + - [37, 20684.0] + - - [1024, 29000, 1, 2320, 1024, 1024, 1024, 29000] + - [37, 20687.0] + - - [1024, 29000, 1, 2324, 1024, 1024, 1024, 29000] + - [37, 20688.0] + - - [1024, 29000, 1, 2325, 1024, 1024, 1024, 29000] + - [21, 20688.0] + - - [1024, 29000, 1, 2329, 1024, 1024, 1024, 29000] + - [37, 20686.0] + - - [1024, 29000, 1, 2338, 1024, 1024, 1024, 29000] + - [21, 20686.0] + - - [1024, 29000, 1, 2345, 1024, 1024, 1024, 29000] + - [21, 20688.0] + - - [1024, 29000, 1, 2350, 1024, 1024, 1024, 29000] + - [21, 20704.0] + - - [1024, 29000, 1, 2362, 1024, 1024, 1024, 29000] + - [37, 20690.0] + - - [1024, 29000, 1, 2366, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 2368, 1024, 1024, 1024, 29000] + - [37, 20694.0] + - - [1024, 29000, 1, 2374, 1024, 1024, 1024, 29000] + - [37, 20687.0] + - - [1024, 29000, 1, 2390, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 561, 1024, 1024, 1024, 29000] + - [21, 20391.0] + - - [1024, 29000, 1, 574, 1024, 1024, 1024, 29000] + - [21, 20397.0] + - - [1024, 29000, 1, 600, 1024, 1024, 1024, 29000] + - [21, 20433.0] + - - [1024, 29000, 1, 608, 1024, 1024, 1024, 29000] + - [21, 20432.0] + - - [1024, 29000, 1, 615, 1024, 1024, 1024, 29000] + - [21, 20434.0] + - - [1024, 29000, 1, 622, 1024, 1024, 1024, 29000] + - [21, 20426.0] + - - [1024, 29000, 1, 625, 1024, 1024, 1024, 29000] + - [21, 20410.0] + - - [1024, 29000, 1, 626, 1024, 1024, 1024, 29000] + - [21, 20442.0] + - - [1024, 29000, 1, 628, 1024, 1024, 1024, 29000] + - [21, 20443.0] + - - [1024, 29000, 1, 636, 1024, 1024, 1024, 29000] + - [21, 20436.0] + - - [1024, 29000, 1, 651, 1024, 1024, 1024, 29000] + - [21, 20427.0] + - - [1024, 29000, 1, 658, 1024, 1024, 1024, 29000] + - [21, 20451.0] + - - [1024, 29000, 1, 669, 1024, 1024, 1024, 29000] + - [21, 20452.0] + - - [1024, 29000, 1, 670, 1024, 1024, 1024, 29000] + - [21, 20451.0] + - - [1024, 29000, 1, 672, 1024, 1024, 1024, 29000] + - [21, 20470.0] + - - [1024, 29000, 1, 684, 1024, 1024, 1024, 29000] + - [21, 20450.0] + - - [1024, 29000, 1, 716, 1024, 1024, 1024, 29000] + - [21, 20469.0] + - - [1024, 29000, 1, 730, 1024, 1024, 1024, 29000] + - [21, 20476.0] + - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] + - [33, 18987.0] + - - [2560, 2560, 1, 2, 2560, 2560, 2560, 2560] + - [3, 980.0] + - - [2560, 29000, 1, 109, 2560, 2560, 2560, 29000] + - [0, 19111.0] + - - [2560, 29000, 1, 121, 2560, 2560, 2560, 29000] + - [15, 19117.0] + - - [2560, 29000, 1, 27, 2560, 2560, 2560, 29000] + - [32, 5335.0] + - - [2560, 29000, 1, 35, 2560, 2560, 2560, 29000] + - [17, 6709.0] + - - [2560, 29000, 1, 36, 2560, 2560, 2560, 29000] + - [8, 6896.0] + - - [2560, 29000, 1, 39, 2560, 2560, 2560, 29000] + - [20, 7463.0] + - - [2560, 29000, 1, 40, 2560, 2560, 2560, 29000] + - [16, 7625.0] + - - [2560, 29000, 1, 42, 2560, 2560, 2560, 29000] + - [16, 8005.0] + - - [2560, 29000, 1, 43, 2560, 2560, 2560, 29000] + - [11, 8168.0] + - - [2560, 29000, 1, 44, 2560, 2560, 2560, 29000] + - [28, 8354.0] + - - [2560, 29000, 1, 46, 2560, 2560, 2560, 29000] + - [2, 8731.0] + - - [2560, 29000, 1, 48, 2560, 2560, 2560, 29000] + - [17, 9072.0] + - - [2560, 29000, 1, 49, 2560, 2560, 2560, 29000] + - [20, 9295.0] + - - [2560, 29000, 1, 50, 2560, 2560, 2560, 29000] + - [20, 9482.0] + - - [2560, 29000, 1, 51, 2560, 2560, 2560, 29000] + - [3, 9653.0] + - - [2560, 29000, 1, 53, 2560, 2560, 2560, 29000] + - [11, 10008.0] + - - [2560, 29000, 1, 54, 2560, 2560, 2560, 29000] + - [8, 10168.0] + - - [2560, 29000, 1, 55, 2560, 2560, 2560, 29000] + - [13, 10367.0] + - - [2560, 29000, 1, 56, 2560, 2560, 2560, 29000] + - [3, 10557.0] + - - [2560, 29000, 1, 57, 2560, 2560, 2560, 29000] + - [13, 10721.0] + - - [2560, 29000, 1, 58, 2560, 2560, 2560, 29000] + - [13, 10913.0] + - - [2560, 29000, 1, 59, 2560, 2560, 2560, 29000] + - [2, 11095.0] + - - [2560, 29000, 1, 61, 2560, 2560, 2560, 29000] + - [13, 11472.0] + - - [2560, 29000, 1, 63, 2560, 2560, 2560, 29000] + - [2, 11792.0] + - - [2560, 29000, 1, 65, 2560, 2560, 2560, 29000] + - [2, 12136.0] + - - [2560, 29000, 1, 66, 2560, 2560, 2560, 29000] + - [2, 12345.0] + - - [2560, 29000, 1, 67, 2560, 2560, 2560, 29000] + - [13, 12466.0] + - - [2560, 29000, 1, 69, 2560, 2560, 2560, 29000] + - [2, 12871.0] + - - [2560, 29000, 1, 70, 2560, 2560, 2560, 29000] + - [13, 13012.0] + - - [2560, 29000, 1, 71, 2560, 2560, 2560, 29000] + - [2, 13217.0] + - - [2560, 29000, 1, 73, 2560, 2560, 2560, 29000] + - [2, 13558.0] + - - [2560, 29000, 1, 74, 2560, 2560, 2560, 29000] + - [2, 13762.0] + - - [2560, 29000, 1, 75, 2560, 2560, 2560, 29000] + - [0, 13831.0] + - - [2560, 29000, 1, 77, 2560, 2560, 2560, 29000] + - [17, 14093.0] + - - [2560, 29000, 1, 78, 2560, 2560, 2560, 29000] + - [2, 14297.0] + - - [2560, 29000, 1, 80, 2560, 2560, 2560, 29000] + - [2, 14787.0] + - - [2560, 29000, 1, 81, 2560, 2560, 2560, 29000] + - [5, 14846.0] + - - [2560, 29000, 1, 82, 2560, 2560, 2560, 29000] + - [5, 15008.0] + - - [2560, 29000, 1, 83, 2560, 2560, 2560, 29000] + - [5, 15253.0] + - - [2560, 29000, 1, 84, 2560, 2560, 2560, 29000] + - [5, 15391.0] + - - [2560, 29000, 1, 88, 2560, 2560, 2560, 29000] + - [0, 16042.0] + - - [2560, 29000, 1, 89, 2560, 2560, 2560, 29000] + - [5, 16175.0] + - - [2560, 29000, 1, 90, 2560, 2560, 2560, 29000] + - [5, 16289.0] + - - [2560, 29000, 1, 92, 2560, 2560, 2560, 29000] + - [5, 16598.0] + - - [2560, 29000, 1, 95, 2560, 2560, 2560, 29000] + - [15, 16915.0] + - - [2560, 29000, 1, 98, 2560, 2560, 2560, 29000] + - [5, 17675.0] + - - [2560, 4096, 1, 1024, 2560, 2560, 2560, 4096] + - [37, 20303.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 2560] + - [46, 20348.0] + - - [1024, 3072, 1, 32768, 1024, 1024, 1024, 3072] + - [20, 18912.0] + - - [1024, 4096, 1, 32768, 1024, 1024, 1024, 4096] + - [20, 18660.0] + - - [1024, 50304, 1, 32768, 1024, 1024, 1024, 50304] + - [6, 20559.0] + - - [4096, 1024, 1, 32768, 4096, 4096, 4096, 1024] + - [45, 18639.0] + - - [1024, 128, 24, 1024, 1024, 1024, 1024, 128] + - [17, 18566.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [33, 18622.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [52, 16310.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [54, 18032.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [48, 18126.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [58, 15943.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [51, 16054.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [51, 16140.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [53, 16310.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [49, 17941.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [57, 18211.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [55, 18330.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [54, 17952.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [55, 18345.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [56, 17923.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [55, 18345.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [56, 18184.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [50, 17448.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [55, 16390.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [55, 14661.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [48, 17929.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [48, 18066.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [48, 18058.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [48, 18111.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [48, 18006.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [50, 16473.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [54, 18262.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [48, 18090.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [54, 18111.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [48, 18099.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [48, 18100.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [48, 18089.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [48, 18120.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [48, 18127.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [54, 18099.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [48, 18189.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [88, 15157.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [63, 14094.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [74, 17007.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [96, 16838.0] + - - [768, 768, 1, 16, 768, 768, 768, 768] + - [92, 3775.0] + - - [768, 768, 1, 320, 768, 768, 768, 768] + - [63, 13579.0] + - - [768, 768, 1, 4096, 768, 768, 768, 768] + - [116, 16815.0] + - - [768, 768, 1, 32, 768, 768, 768, 768] + - [95, 5519.0] + - - [768, 768, 1, 640, 768, 768, 768, 768] + - [63, 15063.0] + - - [768, 768, 1, 64, 768, 768, 768, 768] + - [92, 7316.0] + - - [768, 768, 1, 1280, 768, 768, 768, 768] + - [63, 15975.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [96, 17006.0] + - - [1024, 1024, 1, 120, 1024, 1024, 1024, 1024] + - [63, 12710.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [92, 324.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [107, 4810.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [107, 1279.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [107, 1829.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [61, 11367.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [63, 16798.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [87, 16720.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [62, 11001.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [63, 16648.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [88, 13981.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 448] + - [61, 12883.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [103, 17547.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [63, 16495.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [61, 12290.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [65, 10850.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [63, 16272.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [87, 9655.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [60, 7682.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [108, 8169.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [63, 13036.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [110, 12956.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [63, 12494.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [86, 14884.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [62, 7780.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [104, 11211.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [110, 15225.0] + - - [704, 1024, 1, 128, 704, 704, 704, 1024] + - [68, 10730.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [87, 12530.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [118, 15926.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [87, 9565.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [87, 12949.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [71, 15899.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 256] + - [63, 11012.0] + - - [448, 2944, 1, 128, 448, 448, 448, 2944] + - [86, 13998.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [89, 9179.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [62, 9384.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [109, 10970.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [65, 9756.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 448] + - [89, 8247.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [96, 14934.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [65, 9487.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [63, 12486.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [87, 9571.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 256] + - [63, 11514.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [109, 14485.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [62, 9050.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [86, 12581.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [72, 11286.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [87, 14144.0] + - - [448, 1856, 1, 128, 448, 448, 448, 1856] + - [86, 11852.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [63, 14231.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [63, 12880.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [71, 13870.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [64, 15537.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [74, 14883.0] + - - [704, 1856, 1, 128, 704, 704, 704, 1856] + - [61, 13937.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [71, 13051.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [72, 10893.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [63, 10212.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [63, 12740.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [88, 13034.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [63, 15253.0] + - - [448, 2368, 1, 128, 448, 448, 448, 2368] + - [109, 12982.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [63, 13851.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [61, 16337.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [62, 9205.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [84, 14074.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [61, 10844.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [85, 9328.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [88, 12899.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [87, 16187.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [63, 12414.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [96, 16558.0] + - - [448, 1024, 1, 128, 448, 448, 448, 1024] + - [62, 8270.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [61, 16044.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 64] + - [62, 6973.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [86, 12419.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [88, 13773.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [61, 10450.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [111, 16978.0] + - - [256, 1856, 1, 128, 256, 256, 256, 1856] + - [63, 8494.0] + - - [448, 1408, 1, 128, 448, 448, 448, 1408] + - [68, 9477.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [87, 12161.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [85, 8026.0] + - - [704, 1408, 1, 128, 704, 704, 704, 1408] + - [61, 12318.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 448] + - [61, 14022.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [62, 7907.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [69, 14583.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [87, 12030.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [112, 9539.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [96, 17021.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [61, 15845.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [109, 11861.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [87, 14892.0] + - - [256, 2368, 1, 128, 256, 256, 256, 2368] + - [63, 9237.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [86, 15040.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [110, 12805.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [61, 10528.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [87, 16077.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [87, 10962.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [88, 14636.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [61, 15040.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [63, 15422.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [63, 12109.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [63, 15375.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 448] + - [86, 11932.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [60, 8948.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [61, 9403.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [65, 9491.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [63, 11854.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [91, 16097.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [63, 10194.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [60, 9328.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [87, 11270.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [111, 16957.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [88, 15399.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 256] + - [62, 7715.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [85, 10611.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [60, 7698.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [87, 12235.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [72, 11312.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [87, 16725.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [63, 15157.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [63, 12897.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [114, 16520.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [85, 8121.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 704] + - [63, 10680.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [87, 13130.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [61, 11989.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 448] + - [62, 9477.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [87, 14883.0] + - - [704, 448, 1, 128, 704, 704, 704, 448] + - [62, 6889.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [110, 12835.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [62, 9348.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [68, 11049.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [61, 14022.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [111, 13047.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [63, 12446.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [91, 15220.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [111, 15492.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [61, 15209.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [61, 14508.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [89, 9200.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [77, 12670.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [112, 9769.0] + - - [64, 5888, 1, 128, 64, 64, 64, 5888] + - [62, 7959.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [63, 12986.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [108, 8042.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [72, 11308.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [87, 12871.0] + - - [704, 704, 1, 128, 704, 704, 704, 704] + - [62, 8885.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [86, 11525.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [62, 6643.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [90, 9677.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [93, 16368.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [88, 12925.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [91, 12332.0] + - - [256, 3584, 1, 128, 256, 256, 256, 3584] + - [63, 11582.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 256] + - [61, 13331.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 256] + - [62, 9129.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [63, 10707.0] + - - [256, 2944, 1, 128, 256, 256, 256, 2944] + - [63, 11165.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [65, 10856.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [61, 14508.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [93, 17554.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 704] + - [86, 13799.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [61, 9650.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [63, 10985.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [63, 11804.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [62, 8247.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [63, 10673.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [110, 16519.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [86, 16116.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 256] + - [62, 8494.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [61, 10986.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [61, 10106.0] + - - [64, 6784, 1, 128, 64, 64, 64, 6784] + - [73, 8008.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [117, 10854.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [62, 8247.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [60, 9047.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [110, 15223.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [89, 10563.0] + - - [64, 5056, 1, 128, 64, 64, 64, 5056] + - [62, 6949.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 64] + - [108, 7656.0] + - - [448, 704, 1, 128, 448, 448, 448, 704] + - [62, 6819.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 704] + - [63, 12342.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [110, 12221.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [69, 14800.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [88, 14556.0] + - - [256, 1408, 1, 128, 256, 256, 256, 1408] + - [108, 7588.0] + - - [256, 4288, 1, 128, 256, 256, 256, 4288] + - [63, 13537.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [63, 11038.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [88, 13385.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [72, 10854.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 64] + - [62, 7894.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [61, 13227.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [108, 8960.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [93, 14495.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [64, 12388.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [86, 16310.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [61, 10177.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [72, 10898.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [62, 6598.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [110, 14919.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [63, 14533.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [63, 12309.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [110, 13407.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [68, 10983.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [109, 13657.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [108, 9028.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [61, 16180.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [88, 13777.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [61, 14041.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [87, 14859.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [65, 9535.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [110, 16480.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [59, 9674.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [90, 10273.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [90, 10268.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [59, 6134.0] + - - [64, 1536, 64, 384, 64, 64, 64, 1536] + - [98, 12118.0] + - - [64, 1536, 64, 256, 64, 64, 64, 1536] + - [109, 18228.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [61, 10734.0] + - - [1024, 1024, 1, 3975, 1024, 1024, 1024, 1024] + - [93, 16909.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [61, 14091.0] + - - [64, 102, 624, 100, 64, 64, 64, 102] + - [61, 11974.0] + - - [64, 112, 576, 111, 64, 64, 64, 112] + - [86, 12859.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [109, 11892.0] + - - [64, 133, 480, 135, 64, 64, 64, 133] + - [61, 11944.0] + - - [1024, 1024, 1, 4026, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 160, 400, 159, 64, 64, 64, 160] + - [61, 14421.0] + - - [1024, 1024, 1, 3780, 1024, 1024, 1024, 1024] + - [96, 16963.0] + - - [64, 228, 272, 232, 64, 64, 64, 228] + - [61, 15644.0] + - - [1024, 1024, 1, 3822, 1024, 1024, 1024, 1024] + - [96, 16971.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [61, 9713.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [61, 14057.0] + - - [64, 135, 480, 134, 64, 64, 64, 135] + - [61, 11839.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [109, 11684.0] + - - [1024, 1024, 1, 3942, 1024, 1024, 1024, 1024] + - [96, 16992.0] + - - [1024, 1024, 1, 3861, 1024, 1024, 1024, 1024] + - [96, 16992.0] + - - [1024, 1024, 1, 4000, 1024, 1024, 1024, 1024] + - [96, 17078.0] + - - [1024, 1024, 1, 3870, 1024, 1024, 1024, 1024] + - [96, 16983.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [61, 8163.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [109, 11860.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [61, 15849.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [109, 12985.0] + - - [1024, 1024, 1, 4032, 1024, 1024, 1024, 1024] + - [74, 17048.0] + - - [1024, 1024, 1, 4012, 1024, 1024, 1024, 1024] + - [96, 17002.0] + - - [1024, 1024, 1, 3681, 1024, 1024, 1024, 1024] + - [96, 16957.0] + - - [1024, 1024, 1, 3927, 1024, 1024, 1024, 1024] + - [96, 16978.0] + - - [1024, 1024, 1, 3894, 1024, 1024, 1024, 1024] + - [96, 16989.0] + - - [64, 132, 480, 135, 64, 64, 64, 132] + - [61, 11539.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [109, 12082.0] + - - [1024, 1024, 1, 3876, 1024, 1024, 1024, 1024] + - [96, 16981.0] + - - [64, 84, 752, 85, 64, 64, 64, 84] + - [61, 10438.0] + - - [1024, 1024, 1, 4050, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [109, 11826.0] + - - [64, 99, 624, 102, 64, 64, 64, 99] + - [109, 11854.0] + - - [64, 143, 432, 148, 64, 64, 64, 143] + - [86, 12600.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 1024, 1024] + - [96, 17040.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [61, 14295.0] + - - [64, 148, 432, 147, 64, 64, 64, 148] + - [86, 13082.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [118, 16982.0] + - - [64, 123, 528, 122, 64, 64, 64, 123] + - [86, 14086.0] + - - [64, 102, 624, 101, 64, 64, 64, 102] + - [86, 11918.0] + - - [1024, 1024, 1, 3978, 1024, 1024, 1024, 1024] + - [96, 16994.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [61, 14315.0] + - - [1024, 1024, 1, 3995, 1024, 1024, 1024, 1024] + - [96, 16971.0] + - - [64, 132, 480, 134, 64, 64, 64, 132] + - [61, 11843.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [61, 12962.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [109, 11767.0] + - - [1024, 1024, 1, 3977, 1024, 1024, 1024, 1024] + - [96, 16996.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [86, 13167.0] + - - [64, 159, 400, 162, 64, 64, 64, 159] + - [61, 14054.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [109, 13787.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [109, 15608.0] + - - [1024, 1024, 1, 3925, 1024, 1024, 1024, 1024] + - [96, 16979.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [86, 11247.0] + - - [1024, 1024, 1, 3956, 1024, 1024, 1024, 1024] + - [96, 16987.0] + - - [1024, 1024, 1, 3976, 1024, 1024, 1024, 1024] + - [96, 16989.0] + - - [64, 111, 576, 112, 64, 64, 64, 111] + - [86, 13117.0] + - - [64, 100, 624, 102, 64, 64, 64, 100] + - [61, 11848.0] + - - [1024, 1024, 1, 3955, 1024, 1024, 1024, 1024] + - [96, 17010.0] + - - [1024, 1024, 1, 4030, 1024, 1024, 1024, 1024] + - [96, 17015.0] + - - [1024, 1024, 1, 3906, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 101, 624, 102, 64, 64, 64, 101] + - [61, 11816.0] + - - [1024, 1024, 1, 3796, 1024, 1024, 1024, 1024] + - [96, 16986.0] + - - [1024, 1024, 1, 3859, 1024, 1024, 1024, 1024] + - [96, 16982.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [109, 8829.0] + - - [1024, 1024, 1, 3860, 1024, 1024, 1024, 1024] + - [96, 16966.0] + - - [1024, 1024, 1, 4005, 1024, 1024, 1024, 1024] + - [96, 17004.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [86, 10423.0] + - - [1024, 1024, 1, 3990, 1024, 1024, 1024, 1024] + - [96, 16998.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [61, 11802.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [61, 9935.0] + - - [1024, 1024, 1, 3999, 1024, 1024, 1024, 1024] + - [96, 16993.0] + - - [1024, 1024, 1, 4020, 1024, 1024, 1024, 1024] + - [96, 16990.0] + - - [1024, 1024, 1, 3939, 1024, 1024, 1024, 1024] + - [96, 17001.0] + - - [64, 77, 816, 78, 64, 64, 64, 77] + - [109, 9705.0] + - - [1024, 1024, 1, 4059, 1024, 1024, 1024, 1024] + - [96, 17017.0] + - - [1024, 1024, 1, 3944, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [61, 13166.0] + - - [1024, 1024, 1, 3720, 1024, 1024, 1024, 1024] + - [96, 16986.0] + - - [1024, 1024, 1, 3910, 1024, 1024, 1024, 1024] + - [96, 16997.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [86, 12399.0] + - - [64, 92, 688, 93, 64, 64, 64, 92] + - [109, 11280.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [86, 11877.0] + - - [1024, 1024, 1, 3969, 1024, 1024, 1024, 1024] + - [96, 17006.0] + - - [1024, 1024, 1, 3948, 1024, 1024, 1024, 1024] + - [96, 16990.0] + - - [1024, 1024, 1, 3996, 1024, 1024, 1024, 1024] + - [96, 17009.0] + - - [1024, 1024, 1, 3900, 1024, 1024, 1024, 1024] + - [96, 16998.0] + - - [1024, 1024, 1, 3640, 1024, 1024, 1024, 1024] + - [96, 16964.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [86, 12977.0] + - - [1024, 1024, 1, 3751, 1024, 1024, 1024, 1024] + - [96, 16978.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [86, 16004.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [61, 10474.0] + - - [1024, 1024, 1, 3712, 1024, 1024, 1024, 1024] + - [96, 17051.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [61, 12807.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [86, 16890.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [61, 15865.0] + - - [64, 192, 36, 25088, 64, 64, 64, 192] + - [73, 10415.0] + - - [128, 128, 64, 25, 128, 128, 128, 128] + - [75, 5160.0] + - - [64, 192, 64, 3200, 64, 64, 64, 192] + - [79, 11264.0] + - - [64, 128, 64, 23104, 64, 64, 64, 128] + - [76, 9224.0] + - - [128, 128, 64, 1600, 128, 128, 128, 128] + - [111, 16400.0] + - - [80, 192, 64, 4608, 80, 80, 80, 192] + - [123, 10407.0] + - - [64, 128, 36, 30, 64, 64, 64, 128] + - [95, 2528.0] + - - [64, 128, 64, 11552, 64, 64, 64, 128] + - [102, 9564.0] + - - [128, 192, 64, 946, 128, 128, 128, 192] + - [63, 16351.0] + - - [64, 192, 64, 12800, 64, 64, 64, 192] + - [113, 9738.0] + - - [224, 224, 64, 128, 224, 224, 224, 224] + - [86, 12255.0] + - - [128, 128, 64, 3360, 128, 128, 128, 128] + - [103, 15029.0] + - - [128, 128, 64, 420, 128, 128, 128, 128] + - [88, 14690.0] + - - [64, 128, 64, 361, 64, 64, 64, 128] + - [61, 11859.0] + - - [64, 128, 36, 53824, 64, 64, 64, 128] + - [124, 9545.0] + - - [128, 160, 36, 512, 128, 128, 128, 160] + - [64, 10439.0] + - - [147, 64, 36, 18816, 147, 147, 147, 64] + - [69, 9291.0] + - - [96, 128, 64, 946, 96, 96, 96, 128] + - [111, 11610.0] + - - [128, 128, 64, 50, 128, 128, 128, 128] + - [62, 8244.0] + - - [160, 224, 36, 128, 160, 160, 160, 224] + - [109, 11039.0] + - - [192, 224, 64, 1152, 192, 192, 192, 224] + - [109, 15408.0] + - - [128, 128, 36, 784, 128, 128, 128, 128] + - [87, 15272.0] + - - [96, 128, 64, 288, 96, 96, 96, 128] + - [85, 10623.0] + - - [128, 128, 64, 400, 128, 128, 128, 128] + - [111, 15353.0] + - - [128, 128, 64, 800, 128, 128, 128, 128] + - [111, 16151.0] + - - [96, 128, 36, 512, 96, 96, 96, 128] + - [61, 10554.0] + - - [96, 128, 64, 800, 96, 96, 96, 128] + - [103, 11764.0] + - - [192, 224, 64, 128, 192, 192, 192, 224] + - [109, 13489.0] + - - [128, 128, 64, 288, 128, 128, 128, 128] + - [88, 14660.0] + - - [96, 208, 36, 512, 96, 96, 96, 208] + - [63, 10235.0] + - - [64, 128, 36, 1568, 64, 64, 64, 128] + - [62, 12331.0] + - - [192, 192, 36, 512, 192, 192, 192, 192] + - [109, 15736.0] + - - [128, 128, 36, 512, 128, 128, 128, 128] + - [110, 14519.0] + - - [96, 208, 64, 1152, 96, 96, 96, 208] + - [116, 10614.0] + - - [128, 192, 64, 3200, 128, 128, 128, 192] + - [77, 15101.0] + - - [160, 160, 64, 288, 160, 160, 160, 160] + - [86, 11898.0] + - - [128, 128, 36, 440, 128, 128, 128, 128] + - [110, 14197.0] + - - [96, 128, 36, 1568, 96, 96, 96, 128] + - [110, 11829.0] + - - [112, 224, 36, 2048, 112, 112, 112, 224] + - [116, 14051.0] + - - [128, 128, 36, 7040, 128, 128, 128, 128] + - [125, 13779.0] + - - [128, 128, 36, 1568, 128, 128, 128, 128] + - [63, 15649.0] + - - [160, 224, 64, 128, 160, 160, 160, 224] + - [86, 10700.0] + - - [192, 224, 36, 2592, 192, 192, 192, 224] + - [61, 15615.0] + - - [64, 128, 64, 2888, 64, 64, 64, 128] + - [66, 13656.0] + - - [64, 128, 36, 480, 64, 64, 64, 128] + - [62, 10991.0] + - - [147, 64, 64, 9702, 147, 147, 147, 64] + - [109, 8768.0] + - - [64, 192, 64, 3698, 64, 64, 64, 192] + - [101, 10746.0] + - - [73, 192, 64, 10439, 73, 73, 73, 192] + - [81, 9604.0] + - - [128, 128, 36, 880, 128, 128, 128, 128] + - [87, 14796.0] + - - [192, 224, 36, 128, 192, 192, 192, 224] + - [86, 13427.0] + - - [64, 128, 36, 12544, 64, 64, 64, 128] + - [124, 9482.0] + - - [160, 160, 36, 512, 160, 160, 160, 160] + - [86, 10518.0] + - - [128, 128, 36, 3136, 128, 128, 128, 128] + - [71, 16548.0] + - - [112, 224, 36, 512, 112, 112, 112, 224] + - [61, 12655.0] + - - [128, 128, 36, 49, 128, 128, 128, 128] + - [67, 6229.0] + - - [112, 224, 64, 1152, 112, 112, 112, 224] + - [93, 13282.0] + - - [128, 192, 36, 1568, 128, 128, 128, 192] + - [63, 13779.0] + - - [128, 192, 36, 512, 128, 128, 128, 192] + - [87, 13115.0] + - - [192, 192, 64, 288, 192, 192, 192, 192] + - [61, 17797.0] + - - [96, 208, 64, 242, 96, 96, 96, 208] + - [71, 9640.0] + - - [64, 128, 64, 5776, 64, 64, 64, 128] + - [83, 9402.0] + - - [128, 192, 64, 288, 128, 128, 128, 192] + - [110, 15150.0] + - - [96, 128, 36, 6272, 96, 96, 96, 128] + - [103, 12147.0] + - - [96, 128, 64, 3200, 96, 96, 96, 128] + - [125, 12343.0] + - - [128, 192, 64, 800, 128, 128, 128, 192] + - [110, 16312.0] + - - [64, 128, 64, 10, 64, 64, 64, 128] + - [82, 1736.0] + - - [96, 208, 64, 288, 96, 96, 96, 208] + - [93, 9883.0] + - - [64, 128, 64, 160, 64, 64, 64, 128] + - [61, 9642.0] + - - [128, 128, 64, 1568, 128, 128, 128, 128] + - [88, 16578.0] + - - [112, 224, 64, 242, 112, 112, 112, 224] + - [61, 11971.0] + - - [160, 192, 64, 288, 160, 160, 160, 192] + - [61, 14662.0] + - - [128, 160, 64, 288, 128, 128, 128, 160] + - [110, 12981.0] + - - [128, 128, 64, 210, 128, 128, 128, 128] + - [64, 13884.0] + - - [73, 192, 36, 23360, 73, 73, 73, 192] + - [108, 8693.0] + - - [160, 192, 36, 512, 160, 160, 160, 192] + - [109, 12633.0] + - - [64, 128, 64, 722, 64, 64, 64, 128] + - [61, 13116.0] + - - [112, 224, 64, 288, 112, 112, 112, 224] + - [61, 12220.0] + - - [64, 192, 36, 6272, 64, 64, 64, 192] + - [73, 10298.0] + - - [64, 128, 36, 6272, 64, 64, 64, 128] + - [80, 11172.0] + - - [128, 128, 36, 3200, 128, 128, 128, 128] + - [88, 16193.0] + - - [128, 128, 36, 392, 128, 128, 128, 128] + - [87, 13945.0] + - - [80, 192, 36, 10368, 80, 80, 80, 192] + - [97, 9554.0] + - - [224, 224, 36, 128, 224, 224, 224, 224] + - [61, 11537.0] + - - [64, 128, 36, 784, 64, 64, 64, 128] + - [62, 11749.0] + - - [128, 128, 64, 200, 128, 128, 128, 128] + - [63, 13907.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [63, 17899.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [109, 17746.0] + - - [289, 1792, 1, 320, 289, 289, 289, 1792] + - [61, 10720.0] + - - [1001, 1024, 1, 32, 1001, 1001, 1001, 1024] + - [60, 6694.0] + - - [784, 400, 1, 32, 784, 784, 784, 400] + - [107, 3323.0] + - - [64, 1536, 32, 256, 64, 64, 64, 1536] + - [98, 16862.0] + - - [289, 2592, 1, 384, 289, 289, 289, 2592] + - [86, 12650.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [86, 17814.0] + - - [289, 2016, 1, 256, 289, 289, 289, 2016] + - [62, 8815.0] + - - [64, 1536, 32, 384, 64, 64, 64, 1536] + - [61, 17188.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [86, 16397.0] + - - [289, 3456, 1, 384, 289, 289, 289, 3456] + - [109, 13639.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [119, 16289.0] + - - [729, 1600, 1, 192, 729, 729, 729, 1600] + - [63, 12856.0] + - - [289, 1344, 1, 192, 289, 289, 289, 1344] + - [62, 8417.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [98, 17796.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [61, 15892.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [119, 15612.0] + - - [289, 1792, 1, 256, 289, 289, 289, 1792] + - [61, 10293.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [61, 17575.0] + - - [5329, 64, 128, 80, 5329, 5329, 5329, 64] + - [59, 6942.0] + - - [64, 1280, 128, 448, 64, 64, 64, 1280] + - [124, 10988.0] + - - [64, 2048, 128, 192, 64, 64, 64, 2048] + - [98, 9388.0] + - - [64, 1280, 128, 384, 64, 64, 64, 1280] + - [115, 10767.0] + - - [64, 1280, 128, 320, 64, 64, 64, 1280] + - [90, 10340.0] + - - [64, 1280, 128, 192, 64, 64, 64, 1280] + - [98, 11135.0] + - - [256, 4096, 1, 6400, 256, 256, 256, 4096] + - [127, 16977.0] + - - [512, 2048, 1, 3427, 512, 512, 512, 2048] + - [96, 16880.0] + - - [512, 2048, 1, 3552, 512, 512, 512, 2048] + - [96, 17007.0] + - - [512, 2048, 1, 3840, 512, 512, 512, 2048] + - [96, 17064.0] + - - [2048, 512, 1, 3427, 2048, 2048, 2048, 512] + - [118, 16888.0] + - - [2048, 512, 1, 3452, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3472, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3475, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [61, 10126.0] + - - [64, 64, 496, 65, 64, 64, 64, 64] + - [109, 10986.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [109, 7418.0] + - - [64, 71, 448, 71, 64, 64, 64, 71] + - [61, 7567.0] + - - [64, 77, 408, 77, 64, 64, 64, 77] + - [86, 7843.0] + - - [64, 77, 408, 78, 64, 64, 64, 77] + - [61, 7993.0] + - - [64, 78, 408, 78, 64, 64, 64, 78] + - [86, 8032.0] + - - [64, 85, 376, 85, 64, 64, 64, 85] + - [86, 8383.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [109, 9298.0] + - - [64, 112, 288, 112, 64, 64, 64, 112] + - [61, 11549.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [61, 11896.0] + - - [64, 123, 264, 122, 64, 64, 64, 123] + - [61, 11903.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [61, 11780.0] + - - [64, 134, 240, 134, 64, 64, 64, 134] + - [61, 10140.0] + - - [64, 135, 240, 134, 64, 64, 64, 135] + - [61, 10261.0] + - - [64, 135, 240, 135, 64, 64, 64, 135] + - [109, 10337.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [61, 18210.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [61, 18320.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [61, 18111.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [119, 13459.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [86, 18474.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [119, 10848.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [115, 10971.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [90, 11227.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [61, 17084.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [90, 10226.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [59, 7991.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [61, 12607.0] + - - [512, 2048, 1, 2790, 512, 512, 512, 2048] + - [118, 16856.0] + - - [512, 2048, 1, 2864, 512, 512, 512, 2048] + - [118, 16887.0] + - - [512, 2048, 1, 3092, 512, 512, 512, 2048] + - [118, 16906.0] + - - [512, 2048, 1, 3113, 512, 512, 512, 2048] + - [118, 16911.0] + - - [512, 2048, 1, 3137, 512, 512, 512, 2048] + - [96, 16896.0] + - - [512, 2048, 1, 3165, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3166, 512, 512, 512, 2048] + - [118, 16939.0] + - - [512, 2048, 1, 3194, 512, 512, 512, 2048] + - [118, 16923.0] + - - [512, 2048, 1, 3219, 512, 512, 512, 2048] + - [118, 16934.0] + - - [512, 2048, 1, 3222, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3234, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3237, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3242, 512, 512, 512, 2048] + - [118, 16941.0] + - - [512, 2048, 1, 3246, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3249, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3251, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3257, 512, 512, 512, 2048] + - [118, 16930.0] + - - [512, 2048, 1, 3262, 512, 512, 512, 2048] + - [118, 16933.0] + - - [512, 2048, 1, 3268, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3282, 512, 512, 512, 2048] + - [96, 16936.0] + - - [512, 2048, 1, 3286, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3287, 512, 512, 512, 2048] + - [118, 16934.0] + - - [512, 2048, 1, 3293, 512, 512, 512, 2048] + - [96, 16933.0] + - - [512, 2048, 1, 3297, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3307, 512, 512, 512, 2048] + - [118, 16924.0] + - - [512, 2048, 1, 3314, 512, 512, 512, 2048] + - [96, 16956.0] + - - [512, 2048, 1, 3315, 512, 512, 512, 2048] + - [96, 16940.0] + - - [512, 2048, 1, 3319, 512, 512, 512, 2048] + - [96, 16950.0] + - - [512, 2048, 1, 3322, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3323, 512, 512, 512, 2048] + - [96, 16938.0] + - - [512, 2048, 1, 3324, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3325, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3327, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3329, 512, 512, 512, 2048] + - [96, 16927.0] + - - [512, 2048, 1, 3332, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3336, 512, 512, 512, 2048] + - [118, 16941.0] + - - [512, 2048, 1, 3339, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3342, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3344, 512, 512, 512, 2048] + - [96, 16957.0] + - - [512, 2048, 1, 3358, 512, 512, 512, 2048] + - [118, 16954.0] + - - [512, 2048, 1, 3360, 512, 512, 512, 2048] + - [96, 17029.0] + - - [512, 2048, 1, 3364, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3365, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3369, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3371, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3374, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3376, 512, 512, 512, 2048] + - [118, 16959.0] + - - [512, 2048, 1, 3377, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3378, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3381, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3382, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3383, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3384, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3385, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3386, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3388, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3390, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3391, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3396, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3399, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3402, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3410, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3412, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3414, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3415, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3418, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3420, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3422, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3425, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3426, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3428, 512, 512, 512, 2048] + - [96, 16955.0] + - - [512, 2048, 1, 3430, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3431, 512, 512, 512, 2048] + - [118, 16959.0] + - - [512, 2048, 1, 3432, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3438, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3439, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3440, 512, 512, 512, 2048] + - [118, 16971.0] + - - [512, 2048, 1, 3443, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3445, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3447, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3448, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3450, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3451, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3452, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3453, 512, 512, 512, 2048] + - [96, 16951.0] + - - [512, 2048, 1, 3455, 512, 512, 512, 2048] + - [96, 16934.0] + - - [512, 2048, 1, 3456, 512, 512, 512, 2048] + - [96, 17029.0] + - - [512, 2048, 1, 3457, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3458, 512, 512, 512, 2048] + - [96, 16947.0] + - - [512, 2048, 1, 3459, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3460, 512, 512, 512, 2048] + - [118, 16936.0] + - - [512, 2048, 1, 3461, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3462, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3466, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3467, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3468, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3470, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3471, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3472, 512, 512, 512, 2048] + - [118, 16954.0] + - - [512, 2048, 1, 3475, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3476, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3477, 512, 512, 512, 2048] + - [96, 16958.0] + - - [512, 2048, 1, 3478, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3479, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3480, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3481, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3483, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3484, 512, 512, 512, 2048] + - [118, 16970.0] + - - [512, 2048, 1, 3487, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3489, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3490, 512, 512, 512, 2048] + - [96, 16953.0] + - - [512, 2048, 1, 3491, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3493, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3494, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3495, 512, 512, 512, 2048] + - [118, 16938.0] + - - [512, 2048, 1, 3497, 512, 512, 512, 2048] + - [96, 16950.0] + - - [512, 2048, 1, 3498, 512, 512, 512, 2048] + - [96, 16967.0] + - - [512, 2048, 1, 3499, 512, 512, 512, 2048] + - [96, 16970.0] + - - [512, 2048, 1, 3501, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3503, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3507, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3508, 512, 512, 512, 2048] + - [118, 16972.0] + - - [512, 2048, 1, 3509, 512, 512, 512, 2048] + - [118, 16978.0] + - - [512, 2048, 1, 3511, 512, 512, 512, 2048] + - [118, 16972.0] + - - [512, 2048, 1, 3514, 512, 512, 512, 2048] + - [96, 16960.0] + - - [512, 2048, 1, 3515, 512, 512, 512, 2048] + - [118, 16974.0] + - - [512, 2048, 1, 3517, 512, 512, 512, 2048] + - [118, 16974.0] + - - [512, 2048, 1, 3518, 512, 512, 512, 2048] + - [96, 16967.0] + - - [512, 2048, 1, 3519, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3520, 512, 512, 512, 2048] + - [96, 17028.0] + - - [512, 2048, 1, 3523, 512, 512, 512, 2048] + - [96, 16958.0] + - - [512, 2048, 1, 3528, 512, 512, 512, 2048] + - [96, 16974.0] + - - [512, 2048, 1, 3529, 512, 512, 512, 2048] + - [118, 16979.0] + - - [512, 2048, 1, 3530, 512, 512, 512, 2048] + - [118, 16978.0] + - - [512, 2048, 1, 3532, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3533, 512, 512, 512, 2048] + - [96, 16975.0] + - - [512, 2048, 1, 3534, 512, 512, 512, 2048] + - [118, 16980.0] + - - [512, 2048, 1, 3538, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3539, 512, 512, 512, 2048] + - [96, 16962.0] + - - [512, 2048, 1, 3541, 512, 512, 512, 2048] + - [96, 16975.0] + - - [512, 2048, 1, 3547, 512, 512, 512, 2048] + - [96, 16951.0] + - - [512, 2048, 1, 3548, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3564, 512, 512, 512, 2048] + - [118, 16973.0] + - - [512, 2048, 1, 3575, 512, 512, 512, 2048] + - [118, 16979.0] + - - [512, 2048, 1, 3598, 512, 512, 512, 2048] + - [96, 16973.0] + - - [512, 2048, 1, 3599, 512, 512, 512, 2048] + - [118, 16990.0] + - - [512, 2048, 1, 3608, 512, 512, 512, 2048] + - [118, 16985.0] + - - [512, 2048, 1, 3780, 512, 512, 512, 2048] + - [74, 16957.0] + - - [512, 2048, 1, 3796, 512, 512, 512, 2048] + - [118, 17007.0] + - - [512, 2048, 1, 3822, 512, 512, 512, 2048] + - [96, 16973.0] + - - [512, 2048, 1, 3859, 512, 512, 512, 2048] + - [118, 16980.0] + - - [512, 2048, 1, 3870, 512, 512, 512, 2048] + - [118, 17006.0] + - - [512, 2048, 1, 3876, 512, 512, 512, 2048] + - [96, 17007.0] + - - [512, 2048, 1, 3906, 512, 512, 512, 2048] + - [118, 17014.0] + - - [512, 2048, 1, 3910, 512, 512, 512, 2048] + - [118, 17023.0] + - - [512, 2048, 1, 3925, 512, 512, 512, 2048] + - [118, 16990.0] + - - [512, 2048, 1, 3942, 512, 512, 512, 2048] + - [118, 17009.0] + - - [512, 2048, 1, 3944, 512, 512, 512, 2048] + - [118, 16985.0] + - - [512, 2048, 1, 3955, 512, 512, 512, 2048] + - [118, 17014.0] + - - [512, 2048, 1, 3968, 512, 512, 512, 2048] + - [96, 17082.0] + - - [512, 2048, 1, 3969, 512, 512, 512, 2048] + - [118, 17008.0] + - - [512, 2048, 1, 3976, 512, 512, 512, 2048] + - [118, 17009.0] + - - [512, 2048, 1, 3977, 512, 512, 512, 2048] + - [118, 17028.0] + - - [512, 2048, 1, 3978, 512, 512, 512, 2048] + - [118, 17017.0] + - - [512, 2048, 1, 3990, 512, 512, 512, 2048] + - [96, 17006.0] + - - [512, 2048, 1, 3995, 512, 512, 512, 2048] + - [118, 17016.0] + - - [512, 2048, 1, 3996, 512, 512, 512, 2048] + - [118, 16996.0] + - - [512, 2048, 1, 3999, 512, 512, 512, 2048] + - [118, 17010.0] + - - [512, 2048, 1, 4005, 512, 512, 512, 2048] + - [118, 17012.0] + - - [512, 2048, 1, 4012, 512, 512, 512, 2048] + - [96, 16996.0] + - - [512, 2048, 1, 4020, 512, 512, 512, 2048] + - [96, 16998.0] + - - [512, 2048, 1, 4026, 512, 512, 512, 2048] + - [96, 17002.0] + - - [512, 2048, 1, 4030, 512, 512, 512, 2048] + - [96, 17017.0] + - - [512, 2048, 1, 4032, 512, 512, 512, 2048] + - [96, 17062.0] + - - [2048, 512, 1, 2790, 2048, 2048, 2048, 512] + - [118, 16852.0] + - - [2048, 512, 1, 2864, 2048, 2048, 2048, 512] + - [118, 16908.0] + - - [2048, 512, 1, 3092, 2048, 2048, 2048, 512] + - [118, 16938.0] + - - [2048, 512, 1, 3113, 2048, 2048, 2048, 512] + - [96, 16915.0] + - - [2048, 512, 1, 3137, 2048, 2048, 2048, 512] + - [96, 16891.0] + - - [2048, 512, 1, 3165, 2048, 2048, 2048, 512] + - [118, 16922.0] + - - [2048, 512, 1, 3166, 2048, 2048, 2048, 512] + - [118, 16939.0] + - - [2048, 512, 1, 3194, 2048, 2048, 2048, 512] + - [96, 16932.0] + - - [2048, 512, 1, 3219, 2048, 2048, 2048, 512] + - [118, 16921.0] + - - [2048, 512, 1, 3222, 2048, 2048, 2048, 512] + - [118, 16915.0] + - - [2048, 512, 1, 3234, 2048, 2048, 2048, 512] + - [118, 16937.0] + - - [2048, 512, 1, 3237, 2048, 2048, 2048, 512] + - [118, 16934.0] + - - [2048, 512, 1, 3242, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3246, 2048, 2048, 2048, 512] + - [118, 16942.0] + - - [2048, 512, 1, 3249, 2048, 2048, 2048, 512] + - [118, 16939.0] + - - [2048, 512, 1, 3251, 2048, 2048, 2048, 512] + - [118, 16941.0] + - - [2048, 512, 1, 3257, 2048, 2048, 2048, 512] + - [118, 16927.0] + - - [2048, 512, 1, 3262, 2048, 2048, 2048, 512] + - [96, 16926.0] + - - [2048, 512, 1, 3268, 2048, 2048, 2048, 512] + - [118, 16931.0] + - - [2048, 512, 1, 3282, 2048, 2048, 2048, 512] + - [118, 16923.0] + - - [2048, 512, 1, 3286, 2048, 2048, 2048, 512] + - [118, 16947.0] + - - [2048, 512, 1, 3287, 2048, 2048, 2048, 512] + - [118, 16934.0] + - - [2048, 512, 1, 3293, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3297, 2048, 2048, 2048, 512] + - [96, 16940.0] + - - [2048, 512, 1, 3307, 2048, 2048, 2048, 512] + - [96, 16924.0] + - - [2048, 512, 1, 3314, 2048, 2048, 2048, 512] + - [96, 16935.0] + - - [2048, 512, 1, 3315, 2048, 2048, 2048, 512] + - [118, 16917.0] + - - [2048, 512, 1, 3319, 2048, 2048, 2048, 512] + - [96, 16932.0] + - - [2048, 512, 1, 3322, 2048, 2048, 2048, 512] + - [96, 16937.0] + - - [2048, 512, 1, 3323, 2048, 2048, 2048, 512] + - [118, 16949.0] + - - [2048, 512, 1, 3324, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3325, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3327, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3329, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3332, 2048, 2048, 2048, 512] + - [96, 16944.0] + - - [2048, 512, 1, 3336, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3339, 2048, 2048, 2048, 512] + - [118, 16947.0] + - - [2048, 512, 1, 3342, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3344, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3358, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3360, 2048, 2048, 2048, 512] + - [96, 17017.0] + - - [2048, 512, 1, 3364, 2048, 2048, 2048, 512] + - [118, 16965.0] + - - [2048, 512, 1, 3365, 2048, 2048, 2048, 512] + - [118, 16967.0] + - - [2048, 512, 1, 3369, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3371, 2048, 2048, 2048, 512] + - [118, 16955.0] + - - [2048, 512, 1, 3374, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3376, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3377, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3378, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3381, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3382, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3383, 2048, 2048, 2048, 512] + - [96, 16958.0] + - - [2048, 512, 1, 3384, 2048, 2048, 2048, 512] + - [118, 16950.0] + - - [2048, 512, 1, 3385, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3386, 2048, 2048, 2048, 512] + - [96, 16949.0] + - - [2048, 512, 1, 3388, 2048, 2048, 2048, 512] + - [118, 16974.0] + - - [2048, 512, 1, 3390, 2048, 2048, 2048, 512] + - [96, 16964.0] + - - [2048, 512, 1, 3391, 2048, 2048, 2048, 512] + - [118, 16951.0] + - - [2048, 512, 1, 3396, 2048, 2048, 2048, 512] + - [118, 16971.0] + - - [2048, 512, 1, 3399, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3402, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3410, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3412, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3414, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3415, 2048, 2048, 2048, 512] + - [118, 16982.0] + - - [2048, 512, 1, 3418, 2048, 2048, 2048, 512] + - [118, 16965.0] + - - [2048, 512, 1, 3420, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3422, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3425, 2048, 2048, 2048, 512] + - [96, 16960.0] + - - [2048, 512, 1, 3426, 2048, 2048, 2048, 512] + - [118, 16961.0] + - - [2048, 512, 1, 3428, 2048, 2048, 2048, 512] + - [96, 16955.0] + - - [2048, 512, 1, 3430, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3431, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3432, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3438, 2048, 2048, 2048, 512] + - [118, 16979.0] + - - [2048, 512, 1, 3439, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3440, 2048, 2048, 2048, 512] + - [96, 16947.0] + - - [2048, 512, 1, 3443, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3445, 2048, 2048, 2048, 512] + - [118, 16955.0] + - - [2048, 512, 1, 3447, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3448, 2048, 2048, 2048, 512] + - [118, 16971.0] + - - [2048, 512, 1, 3450, 2048, 2048, 2048, 512] + - [118, 16960.0] + - - [2048, 512, 1, 3451, 2048, 2048, 2048, 512] + - [96, 16962.0] + - - [2048, 512, 1, 3453, 2048, 2048, 2048, 512] + - [118, 16962.0] + - - [2048, 512, 1, 3455, 2048, 2048, 2048, 512] + - [118, 16973.0] + - - [2048, 512, 1, 3456, 2048, 2048, 2048, 512] + - [96, 17026.0] + - - [2048, 512, 1, 3457, 2048, 2048, 2048, 512] + - [118, 16950.0] + - - [2048, 512, 1, 3458, 2048, 2048, 2048, 512] + - [96, 16933.0] + - - [2048, 512, 1, 3459, 2048, 2048, 2048, 512] + - [118, 16949.0] + - - [2048, 512, 1, 3460, 2048, 2048, 2048, 512] + - [96, 16952.0] + - - [2048, 512, 1, 3461, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3462, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3466, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3467, 2048, 2048, 2048, 512] + - [118, 16958.0] + - - [2048, 512, 1, 3468, 2048, 2048, 2048, 512] + - [96, 16955.0] + - - [2048, 512, 1, 3470, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3471, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3476, 2048, 2048, 2048, 512] + - [118, 16961.0] + - - [2048, 512, 1, 3477, 2048, 2048, 2048, 512] + - [118, 16964.0] + - - [2048, 512, 1, 3478, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3479, 2048, 2048, 2048, 512] + - [118, 16964.0] + - - [2048, 512, 1, 3480, 2048, 2048, 2048, 512] + - [96, 16946.0] + - - [2048, 512, 1, 3481, 2048, 2048, 2048, 512] + - [96, 16950.0] + - - [2048, 512, 1, 3483, 2048, 2048, 2048, 512] + - [96, 16962.0] + - - [2048, 512, 1, 3484, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3487, 2048, 2048, 2048, 512] + - [118, 16973.0] + - - [2048, 512, 1, 3489, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3490, 2048, 2048, 2048, 512] + - [118, 16967.0] + - - [2048, 512, 1, 3491, 2048, 2048, 2048, 512] + - [118, 16958.0] + - - [2048, 512, 1, 3493, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3494, 2048, 2048, 2048, 512] + - [118, 16940.0] + - - [2048, 512, 1, 3495, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3497, 2048, 2048, 2048, 512] + - [96, 16984.0] + - - [2048, 512, 1, 3498, 2048, 2048, 2048, 512] + - [96, 16956.0] + - - [2048, 512, 1, 3499, 2048, 2048, 2048, 512] + - [96, 16975.0] + - - [2048, 512, 1, 3501, 2048, 2048, 2048, 512] + - [96, 16967.0] + - - [2048, 512, 1, 3503, 2048, 2048, 2048, 512] + - [118, 16960.0] + - - [2048, 512, 1, 3507, 2048, 2048, 2048, 512] + - [96, 16956.0] + - - [2048, 512, 1, 3508, 2048, 2048, 2048, 512] + - [96, 16972.0] + - - [2048, 512, 1, 3509, 2048, 2048, 2048, 512] + - [96, 16965.0] + - - [2048, 512, 1, 3511, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3514, 2048, 2048, 2048, 512] + - [96, 16966.0] + - - [2048, 512, 1, 3515, 2048, 2048, 2048, 512] + - [96, 16969.0] + - - [2048, 512, 1, 3517, 2048, 2048, 2048, 512] + - [118, 16977.0] + - - [2048, 512, 1, 3518, 2048, 2048, 2048, 512] + - [118, 16976.0] + - - [2048, 512, 1, 3519, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3520, 2048, 2048, 2048, 512] + - [96, 17044.0] + - - [2048, 512, 1, 3523, 2048, 2048, 2048, 512] + - [96, 16972.0] + - - [2048, 512, 1, 3528, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3529, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3530, 2048, 2048, 2048, 512] + - [96, 16979.0] + - - [2048, 512, 1, 3532, 2048, 2048, 2048, 512] + - [96, 16969.0] + - - [2048, 512, 1, 3533, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3534, 2048, 2048, 2048, 512] + - [118, 16975.0] + - - [2048, 512, 1, 3538, 2048, 2048, 2048, 512] + - [118, 16983.0] + - - [2048, 512, 1, 3539, 2048, 2048, 2048, 512] + - [118, 16982.0] + - - [2048, 512, 1, 3541, 2048, 2048, 2048, 512] + - [118, 16999.0] + - - [2048, 512, 1, 3547, 2048, 2048, 2048, 512] + - [118, 16997.0] + - - [2048, 512, 1, 3548, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3552, 2048, 2048, 2048, 512] + - [96, 17040.0] + - - [2048, 512, 1, 3564, 2048, 2048, 2048, 512] + - [118, 16999.0] + - - [2048, 512, 1, 3575, 2048, 2048, 2048, 512] + - [118, 16976.0] + - - [2048, 512, 1, 3598, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3599, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3608, 2048, 2048, 2048, 512] + - [118, 16991.0] + - - [2048, 512, 1, 3780, 2048, 2048, 2048, 512] + - [96, 16997.0] + - - [2048, 512, 1, 3796, 2048, 2048, 2048, 512] + - [96, 17003.0] + - - [2048, 512, 1, 3822, 2048, 2048, 2048, 512] + - [118, 16983.0] + - - [2048, 512, 1, 3840, 2048, 2048, 2048, 512] + - [96, 17053.0] + - - [2048, 512, 1, 3859, 2048, 2048, 2048, 512] + - [96, 17019.0] + - - [2048, 512, 1, 3870, 2048, 2048, 2048, 512] + - [96, 17003.0] + - - [2048, 512, 1, 3876, 2048, 2048, 2048, 512] + - [118, 16994.0] + - - [2048, 512, 1, 3906, 2048, 2048, 2048, 512] + - [96, 16998.0] + - - [2048, 512, 1, 3910, 2048, 2048, 2048, 512] + - [118, 17025.0] + - - [2048, 512, 1, 3925, 2048, 2048, 2048, 512] + - [118, 17014.0] + - - [2048, 512, 1, 3942, 2048, 2048, 2048, 512] + - [118, 16998.0] + - - [2048, 512, 1, 3944, 2048, 2048, 2048, 512] + - [118, 17015.0] + - - [2048, 512, 1, 3955, 2048, 2048, 2048, 512] + - [118, 17010.0] + - - [2048, 512, 1, 3968, 2048, 2048, 2048, 512] + - [96, 17093.0] + - - [2048, 512, 1, 3969, 2048, 2048, 2048, 512] + - [118, 17031.0] + - - [2048, 512, 1, 3976, 2048, 2048, 2048, 512] + - [118, 17006.0] + - - [2048, 512, 1, 3977, 2048, 2048, 2048, 512] + - [118, 17010.0] + - - [2048, 512, 1, 3978, 2048, 2048, 2048, 512] + - [118, 17023.0] + - - [2048, 512, 1, 3990, 2048, 2048, 2048, 512] + - [118, 17006.0] + - - [2048, 512, 1, 3995, 2048, 2048, 2048, 512] + - [118, 17019.0] + - - [2048, 512, 1, 3996, 2048, 2048, 2048, 512] + - [118, 17012.0] + - - [2048, 512, 1, 3999, 2048, 2048, 2048, 512] + - [118, 17017.0] + - - [2048, 512, 1, 4005, 2048, 2048, 2048, 512] + - [118, 16997.0] + - - [2048, 512, 1, 4012, 2048, 2048, 2048, 512] + - [118, 17026.0] + - - [2048, 512, 1, 4020, 2048, 2048, 2048, 512] + - [118, 17022.0] + - - [2048, 512, 1, 4026, 2048, 2048, 2048, 512] + - [118, 17016.0] + - - [2048, 512, 1, 4030, 2048, 2048, 2048, 512] + - [96, 17016.0] + - - [2048, 512, 1, 4032, 2048, 2048, 2048, 512] + - [96, 17093.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [109, 10980.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [61, 12770.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [109, 17200.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [127, 16972.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [96, 17076.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [96, 17168.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [96, 17193.0] + - - [768, 768, 1, 384, 768, 768, 768, 768] + - [63, 14050.0] + - - [768, 384, 1, 384, 768, 768, 768, 384] + - [120, 10447.0] + - - [1152, 576, 1, 384, 1152, 1152, 1152, 576] + - [110, 12157.0] + - - [384, 768, 1, 384, 384, 384, 384, 768] + - [62, 10584.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [61, 6792.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [61, 16756.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [86, 16328.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [96, 16677.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [63, 13959.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [65, 10012.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [88, 14765.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [61, 15021.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [61, 10699.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [93, 16363.0] + - - [2304, 128, 1, 128, 2304, 2304, 2304, 128] + - [62, 7204.0] + - - [2688, 128, 1, 128, 2688, 2688, 2688, 128] + - [62, 7414.0] + - - [3072, 128, 1, 128, 3072, 3072, 3072, 128] + - [60, 7914.0] + - - [3456, 128, 1, 128, 3456, 3456, 3456, 128] + - [73, 8043.0] + - - [3840, 128, 1, 128, 3840, 3840, 3840, 128] + - [63, 8738.0] + - - [4224, 128, 1, 128, 4224, 4224, 4224, 128] + - [63, 9585.0] + - - [4608, 128, 1, 128, 4608, 4608, 4608, 128] + - [61, 10371.0] + - - [4992, 128, 1, 128, 4992, 4992, 4992, 128] + - [62, 9577.0] + - - [5376, 128, 1, 128, 5376, 5376, 5376, 128] + - [63, 10194.0] + - - [5760, 128, 1, 128, 5760, 5760, 5760, 128] + - [63, 10822.0] + - - [6144, 128, 1, 128, 6144, 6144, 6144, 128] + - [61, 11362.0] + - - [6528, 128, 1, 128, 6528, 6528, 6528, 128] + - [63, 12099.0] + - - [6912, 128, 1, 128, 6912, 6912, 6912, 128] + - [68, 10995.0] + - - [7296, 128, 1, 128, 7296, 7296, 7296, 128] + - [63, 11628.0] + - - [7680, 128, 1, 128, 7680, 7680, 7680, 128] + - [61, 11984.0] + - - [8064, 128, 1, 128, 8064, 8064, 8064, 128] + - [87, 12607.0] + - - [8448, 128, 1, 128, 8448, 8448, 8448, 128] + - [63, 13009.0] + - - [8832, 128, 1, 128, 8832, 8832, 8832, 128] + - [63, 13600.0] + - - [2304, 128, 1, 256, 2304, 2304, 2304, 128] + - [62, 9207.0] + - - [2688, 128, 1, 256, 2688, 2688, 2688, 128] + - [60, 8635.0] + - - [3072, 128, 1, 256, 3072, 3072, 3072, 128] + - [62, 9478.0] + - - [3456, 128, 1, 256, 3456, 3456, 3456, 128] + - [63, 9882.0] + - - [3840, 128, 1, 256, 3840, 3840, 3840, 128] + - [63, 10942.0] + - - [4224, 128, 1, 256, 4224, 4224, 4224, 128] + - [61, 11912.0] + - - [4608, 128, 1, 256, 4608, 4608, 4608, 128] + - [63, 12862.0] + - - [4992, 128, 1, 256, 4992, 4992, 4992, 128] + - [87, 11113.0] + - - [5376, 128, 1, 256, 5376, 5376, 5376, 128] + - [63, 11791.0] + - - [5760, 128, 1, 256, 5760, 5760, 5760, 128] + - [87, 12600.0] + - - [6144, 128, 1, 256, 6144, 6144, 6144, 128] + - [63, 13263.0] + - - [6528, 128, 1, 256, 6528, 6528, 6528, 128] + - [63, 14185.0] + - - [6912, 128, 1, 256, 6912, 6912, 6912, 128] + - [87, 12404.0] + - - [7296, 128, 1, 256, 7296, 7296, 7296, 128] + - [78, 12993.0] + - - [7680, 128, 1, 256, 7680, 7680, 7680, 128] + - [63, 13545.0] + - - [8064, 128, 1, 256, 8064, 8064, 8064, 128] + - [78, 14314.0] + - - [8448, 128, 1, 256, 8448, 8448, 8448, 128] + - [63, 14819.0] + - - [8832, 128, 1, 256, 8832, 8832, 8832, 128] + - [88, 15610.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [63, 15397.0] + - - [384, 1536, 1, 384, 384, 384, 384, 1536] + - [63, 13981.0] + - - [384, 1920, 1, 384, 384, 384, 384, 1920] + - [63, 13495.0] + - - [384, 2304, 1, 384, 384, 384, 384, 2304] + - [88, 13057.0] + - - [64, 192, 64, 1280, 64, 64, 64, 192] + - [109, 14917.0] + - - [64, 320, 64, 1280, 64, 64, 64, 320] + - [101, 15333.0] + - - [64, 384, 64, 1280, 64, 64, 64, 384] + - [109, 15585.0] + - - [64, 448, 64, 1280, 64, 64, 64, 448] + - [119, 14815.0] + - - [64, 192, 64, 2048, 64, 64, 64, 192] + - [113, 14787.0] + - - [64, 320, 64, 2048, 64, 64, 64, 320] + - [76, 12161.0] + - - [64, 384, 64, 2048, 64, 64, 64, 384] + - [76, 12047.0] + - - [64, 448, 64, 2048, 64, 64, 64, 448] + - [76, 12114.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 64] + - [110, 17339.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 64] + - [63, 17910.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 64] + - [110, 18027.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 80] + - [59, 8145.0] + - - [64, 192, 32, 1280, 64, 64, 64, 192] + - [62, 10985.0] + - - [64, 320, 32, 1280, 64, 64, 64, 320] + - [109, 12675.0] + - - [64, 384, 32, 1280, 64, 64, 64, 384] + - [86, 14975.0] + - - [64, 448, 32, 1280, 64, 64, 64, 448] + - [66, 13771.0] + - - [64, 192, 32, 2048, 64, 64, 64, 192] + - [94, 11567.0] + - - [64, 320, 32, 2048, 64, 64, 64, 320] + - [109, 12839.0] + - - [64, 384, 32, 2048, 64, 64, 64, 384] + - [86, 15163.0] + - - [64, 448, 32, 2048, 64, 64, 64, 448] + - [113, 13692.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 64] + - [110, 15887.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 64] + - [110, 16919.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 64] + - [110, 17049.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 80] + - [63, 11046.0] + - - [289, 128, 32, 768, 289, 289, 289, 128] + - [109, 14161.0] + - - [289, 160, 32, 768, 289, 289, 289, 160] + - [86, 13023.0] + - - [289, 192, 32, 768, 289, 289, 289, 192] + - [109, 15563.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [86, 16170.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [98, 18441.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [116, 13149.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [74, 17165.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [88, 15001.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [93, 13915.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [93, 15208.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [116, 14171.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [61, 13275.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [61, 9216.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [61, 8732.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [61, 10625.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [63, 17946.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [86, 11278.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [86, 13943.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [64, 12394.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [64, 15110.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [111, 15092.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [61, 10513.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [61, 13190.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [66, 2834.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [61, 16829.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [61, 7002.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [88, 16157.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [96, 17142.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [88, 14924.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [76, 16998.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [61, 8004.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [88, 16251.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [74, 17133.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [109, 16934.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [61, 7362.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [88, 16156.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [96, 17151.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [86, 17336.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [107, 1628.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [88, 15327.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [96, 16919.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [100, 15418.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [61, 12224.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [107, 2913.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [61, 13508.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [61, 13222.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [108, 2636.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [63, 13657.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [93, 16480.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [106, 17163.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [96, 17190.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [106, 17153.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [107, 326.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [92, 4934.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [123, 10721.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [83, 10725.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [86, 13839.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [119, 14045.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [110, 15336.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [86, 15420.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [87, 14402.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [63, 14397.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [71, 14907.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [71, 14945.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [105, 12482.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [68, 12369.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [123, 16423.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [61, 16217.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [99, 12387.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [86, 12248.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [69, 12296.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [68, 12218.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [108, 8582.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [61, 17562.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [110, 18028.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [127, 16813.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [88, 16294.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [88, 14935.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [63, 13870.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [107, 301.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [107, 362.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [63, 13741.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [63, 12677.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [59, 9582.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [76, 8096.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [88, 8994.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [119, 7798.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [79, 7460.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [107, 2926.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [127, 17008.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [102, 10793.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [81, 10752.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [122, 11073.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [61, 17581.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [110, 8825.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [86, 10906.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [59, 6781.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [95, 6451.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [121, 6671.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [124, 6537.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [61, 10251.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [68, 6065.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [68, 6022.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [68, 6083.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [68, 6053.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [61, 12840.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [61, 14000.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [63, 16332.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [109, 17971.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [110, 18595.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [88, 16740.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [86, 15595.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [87, 16216.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [61, 14847.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [110, 14749.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [86, 16636.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [99, 16556.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [127, 16898.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [63, 15747.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [87, 14554.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [63, 13746.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [87, 13498.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [87, 14828.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [88, 16372.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [63, 15259.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [71, 14096.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [125, 14394.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [61, 15804.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [93, 13833.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [71, 13673.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [67, 531.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [95, 1779.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [70, 499.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [59, 1826.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [107, 1843.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [75, 592.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [82, 1933.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [59, 2042.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [107, 557.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [107, 517.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [60, 9560.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [116, 15091.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [93, 15056.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [125, 14644.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [71, 14444.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [116, 14925.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [71, 14841.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [63, 14737.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [87, 14728.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [87, 13945.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [87, 13928.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [116, 14488.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [71, 14336.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [71, 12850.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [63, 12616.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [87, 13634.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [63, 13764.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [72, 10741.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [115, 10797.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [110, 18138.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [126, 12573.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [61, 12475.0] + - - [64, 64, 64, 13216, 64, 64, 64, 64] + - [80, 6971.0] + - - [64, 96, 36, 10368, 64, 64, 64, 96] + - [80, 8398.0] + - - [64, 64, 36, 12544, 64, 64, 64, 64] + - [132, 7460.0] + - - [64, 64, 36, 11552, 64, 64, 64, 64] + - [130, 7811.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [128, 15060.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [128, 15329.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [128, 15287.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [128, 15552.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [128, 15605.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [128, 15678.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [128, 15686.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [128, 15654.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [128, 15732.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [128, 15785.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [128, 15799.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [128, 15846.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [129, 15857.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [128, 15875.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [128, 14872.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [128, 14974.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [128, 15138.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [128, 15192.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [128, 14272.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [131, 8465.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [161, 6139.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [136, 3663.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [147, 6759.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 64] + - [135, 3068.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [165, 4917.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [136, 6200.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [165, 6481.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [192, 4946.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [169, 9514.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [136, 7303.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [147, 8018.0] + - - [704, 256, 1, 128, 704, 704, 704, 256] + - [184, 4846.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [184, 4887.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [169, 9133.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [136, 6918.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [136, 6638.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [165, 7712.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [165, 6475.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [186, 7005.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [165, 7097.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [192, 6758.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [152, 3866.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [165, 4946.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [169, 8308.0] + - - [64, 2944, 1, 128, 64, 64, 64, 2944] + - [136, 5067.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [169, 8798.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [134, 7634.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [169, 7977.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [201, 4449.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [184, 5430.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [192, 5330.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [182, 5976.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 64] + - [186, 6184.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [165, 6746.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [134, 6554.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [151, 5105.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [136, 4806.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 256] + - [136, 5887.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 64] + - [136, 5004.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [173, 4485.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [168, 3721.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [165, 6268.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [196, 7973.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [138, 6194.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 64] + - [136, 5187.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [147, 8743.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [169, 7698.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [196, 8291.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [156, 8270.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [207, 9532.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [136, 4459.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [136, 5930.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 64] + - [136, 4127.0] + - - [64, 1408, 1, 128, 64, 64, 64, 1408] + - [135, 3084.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [134, 7999.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [136, 6216.0] + - - [448, 256, 1, 128, 448, 448, 448, 256] + - [152, 3529.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [192, 5325.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [135, 3051.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [165, 6247.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [182, 7705.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [147, 6883.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 64] + - [152, 3503.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [135, 3068.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [182, 8027.0] + - - [256, 448, 1, 128, 256, 256, 256, 448] + - [168, 3291.0] + - - [64, 3584, 1, 128, 64, 64, 64, 3584] + - [186, 5224.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [202, 4863.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [169, 7800.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [134, 7984.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [135, 3757.0] + - - [64, 1856, 1, 128, 64, 64, 64, 1856] + - [151, 3440.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [151, 5187.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [192, 7688.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [138, 5489.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [196, 6484.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [165, 7133.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [169, 9167.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [165, 5310.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [147, 7790.0] + - - [256, 704, 1, 128, 256, 256, 256, 704] + - [160, 4727.0] + - - [256, 1024, 1, 128, 256, 256, 256, 1024] + - [161, 5971.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [192, 4938.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [147, 8319.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [142, 4964.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [169, 8075.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [134, 5946.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [147, 9120.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [165, 6987.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [173, 4660.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [141, 3757.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [147, 7739.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [136, 6079.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [165, 6987.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [169, 7932.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [165, 5323.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [186, 6388.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [186, 7349.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [182, 7690.0] + - - [64, 2368, 1, 128, 64, 64, 64, 2368] + - [173, 4254.0] + - - [64, 4288, 1, 128, 64, 64, 64, 4288] + - [186, 6228.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [138, 6540.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [189, 3721.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [169, 8279.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [182, 7690.0] + - - [448, 448, 1, 128, 448, 448, 448, 448] + - [184, 5375.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [182, 6054.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [149, 8272.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [190, 7874.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [142, 7889.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [149, 8387.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [160, 5802.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [144, 8951.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [190, 6730.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [177, 8294.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [165, 7470.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [177, 7055.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [184, 5361.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [144, 6388.0] + - - [64, 64, 36, 3136, 64, 64, 64, 64] + - [192, 8500.0] + - - [64, 64, 64, 826, 64, 64, 64, 64] + - [147, 8290.0] + - - [64, 64, 64, 1600, 64, 64, 64, 64] + - [147, 8804.0] + - - [64, 96, 64, 288, 64, 64, 64, 96] + - [190, 7832.0] + - - [96, 96, 36, 1568, 96, 96, 96, 96] + - [162, 8242.0] + - - [96, 96, 36, 2592, 96, 96, 96, 96] + - [188, 8323.0] + - - [64, 96, 64, 800, 64, 64, 64, 96] + - [196, 8456.0] + - - [35, 96, 36, 8960, 35, 35, 35, 96] + - [194, 4414.0] + - - [32, 64, 36, 43808, 32, 32, 32, 64] + - [149, 4524.0] + - - [64, 64, 64, 81, 64, 64, 64, 64] + - [138, 3962.0] + - - [64, 96, 36, 512, 64, 64, 64, 96] + - [140, 6456.0] + - - [64, 64, 64, 3200, 64, 64, 64, 64] + - [147, 9110.0] + - - [64, 64, 36, 3520, 64, 64, 64, 64] + - [192, 8642.0] + - - [64, 64, 64, 5408, 64, 64, 64, 64] + - [208, 8477.0] + - - [35, 96, 36, 13440, 35, 35, 35, 96] + - [194, 4317.0] + - - [96, 96, 64, 1152, 96, 96, 96, 96] + - [140, 9033.0] + - - [32, 64, 36, 90, 32, 32, 32, 64] + - [185, 2100.0] + - - [64, 64, 64, 800, 64, 64, 64, 64] + - [169, 8283.0] + - - [64, 64, 36, 1568, 64, 64, 64, 64] + - [165, 8147.0] + - - [64, 64, 36, 196, 64, 64, 64, 64] + - [160, 4722.0] + - - [35, 96, 64, 4235, 35, 35, 35, 96] + - [207, 4842.0] + - - [149, 32, 36, 19072, 149, 149, 149, 32] + - [175, 5441.0] + - - [64, 96, 36, 1568, 64, 64, 64, 96] + - [207, 7160.0] + - - [96, 96, 64, 800, 96, 96, 96, 96] + - [140, 9099.0] + - - [32, 64, 64, 640, 32, 32, 32, 64] + - [140, 6096.0] + - - [64, 64, 36, 392, 64, 64, 64, 64] + - [144, 6046.0] + - - [64, 64, 64, 1652, 64, 64, 64, 64] + - [147, 8820.0] + - - [64, 96, 36, 2592, 64, 64, 64, 96] + - [167, 7897.0] + - - [64, 64, 36, 6272, 64, 64, 64, 64] + - [149, 8850.0] + - - [32, 64, 64, 20000, 32, 32, 32, 64] + - [175, 4464.0] + - - [64, 64, 64, 648, 64, 64, 64, 64] + - [138, 7490.0] + - - [32, 64, 36, 1440, 32, 32, 32, 64] + - [143, 4685.0] + - - [64, 64, 64, 100, 64, 64, 64, 64] + - [138, 5328.0] + - - [64, 96, 64, 4608, 64, 64, 64, 96] + - [154, 8461.0] + - - [64, 64, 64, 200, 64, 64, 64, 64] + - [138, 5958.0] + - - [32, 64, 64, 40, 32, 32, 32, 64] + - [137, 2149.0] + - - [64, 96, 64, 1152, 64, 64, 64, 96] + - [169, 8665.0] + - - [149, 32, 64, 8195, 149, 149, 149, 32] + - [134, 5538.0] + - - [35, 96, 64, 6160, 35, 35, 35, 96] + - [147, 4704.0] + - - [64, 64, 36, 1760, 64, 64, 64, 64] + - [165, 7789.0] + - - [64, 2880, 1, 320, 64, 64, 64, 2880] + - [136, 6397.0] + - - [49, 832, 32, 256, 49, 49, 49, 832] + - [142, 7752.0] + - - [289, 1120, 1, 160, 289, 289, 289, 1120] + - [142, 6378.0] + - - [64, 1728, 1, 320, 64, 64, 64, 1728] + - [142, 4560.0] + - - [49, 832, 32, 160, 49, 49, 49, 832] + - [176, 7579.0] + - - [49, 832, 32, 384, 49, 49, 49, 832] + - [142, 7917.0] + - - [289, 896, 1, 192, 289, 289, 289, 896] + - [138, 6309.0] + - - [289, 896, 1, 128, 289, 289, 289, 896] + - [136, 5543.0] + - - [196, 800, 1, 64, 196, 196, 196, 800] + - [148, 2803.0] + - - [64, 1344, 1, 512, 64, 64, 64, 1344] + - [146, 4124.0] + - - [64, 1152, 1, 384, 64, 64, 64, 1152] + - [143, 3965.0] + - - [64, 1152, 1, 448, 64, 64, 64, 1152] + - [143, 4078.0] + - - [49, 832, 32, 128, 49, 49, 49, 832] + - [202, 7428.0] + - - [49, 832, 32, 48, 49, 49, 49, 832] + - [184, 6275.0] + - - [64, 1152, 1, 256, 64, 64, 64, 1152] + - [164, 3616.0] + - - [49, 832, 32, 32, 49, 49, 49, 832] + - [136, 5479.0] + - - [289, 1120, 1, 192, 289, 289, 289, 1120] + - [163, 6668.0] + - - [196, 600, 1, 64, 196, 196, 196, 600] + - [133, 2382.0] + - - [49, 832, 32, 192, 49, 49, 49, 832] + - [163, 7623.0] + - - [64, 1728, 1, 192, 64, 64, 64, 1728] + - [168, 3918.0] + - - [64, 38, 840, 38, 64, 64, 64, 38] + - [184, 5452.0] + - - [64, 49, 648, 49, 64, 64, 64, 49] + - [160, 6963.0] + - - [64, 32, 992, 32, 64, 64, 64, 32] + - [156, 6042.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [136, 4998.0] + - - [64, 41, 776, 41, 64, 64, 64, 41] + - [165, 5814.0] + - - [64, 45, 712, 45, 64, 64, 64, 45] + - [144, 6489.0] + - - [64, 54, 592, 54, 64, 64, 64, 54] + - [160, 7672.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [184, 8312.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [198, 7992.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [190, 7868.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [160, 4000.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [153, 1884.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [182, 4997.0] + - - [512, 512, 1, 3780, 512, 512, 512, 512] + - [147, 9134.0] + - - [512, 512, 1, 3796, 512, 512, 512, 512] + - [169, 9143.0] + - - [512, 512, 1, 3822, 512, 512, 512, 512] + - [196, 9145.0] + - - [512, 512, 1, 3840, 512, 512, 512, 512] + - [169, 9175.0] + - - [512, 512, 1, 3859, 512, 512, 512, 512] + - [196, 9148.0] + - - [512, 512, 1, 3870, 512, 512, 512, 512] + - [169, 9146.0] + - - [512, 512, 1, 3876, 512, 512, 512, 512] + - [196, 9147.0] + - - [512, 512, 1, 3906, 512, 512, 512, 512] + - [169, 9146.0] + - - [512, 512, 1, 3910, 512, 512, 512, 512] + - [147, 9148.0] + - - [512, 512, 1, 3925, 512, 512, 512, 512] + - [169, 9138.0] + - - [512, 512, 1, 3927, 512, 512, 512, 512] + - [147, 9134.0] + - - [512, 512, 1, 3942, 512, 512, 512, 512] + - [207, 9147.0] + - - [512, 512, 1, 3944, 512, 512, 512, 512] + - [147, 9158.0] + - - [512, 512, 1, 3955, 512, 512, 512, 512] + - [169, 9154.0] + - - [512, 512, 1, 3968, 512, 512, 512, 512] + - [169, 9155.0] + - - [512, 512, 1, 3969, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 3976, 512, 512, 512, 512] + - [169, 9156.0] + - - [512, 512, 1, 3977, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 3978, 512, 512, 512, 512] + - [196, 9154.0] + - - [512, 512, 1, 3990, 512, 512, 512, 512] + - [169, 9157.0] + - - [512, 512, 1, 3995, 512, 512, 512, 512] + - [180, 9166.0] + - - [512, 512, 1, 3996, 512, 512, 512, 512] + - [147, 9168.0] + - - [512, 512, 1, 3999, 512, 512, 512, 512] + - [147, 9183.0] + - - [512, 512, 1, 4005, 512, 512, 512, 512] + - [196, 9185.0] + - - [512, 512, 1, 4012, 512, 512, 512, 512] + - [147, 9153.0] + - - [512, 512, 1, 4020, 512, 512, 512, 512] + - [147, 9146.0] + - - [512, 512, 1, 4026, 512, 512, 512, 512] + - [196, 9157.0] + - - [512, 512, 1, 4030, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 4032, 512, 512, 512, 512] + - [169, 9196.0] + - - [512, 512, 1, 4050, 512, 512, 512, 512] + - [156, 9175.0] + - - [512, 512, 1, 4059, 512, 512, 512, 512] + - [147, 9187.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [201, 6348.0] + - - [384, 192, 1, 384, 384, 384, 384, 192] + - [143, 3965.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [147, 8637.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [169, 8788.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [156, 9044.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [196, 9087.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [147, 9124.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [156, 9156.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [169, 9171.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [147, 9217.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [147, 9206.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [169, 9206.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [147, 9221.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [169, 9215.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [156, 9225.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [169, 9240.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [196, 9246.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [169, 9265.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [147, 9272.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [196, 8920.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [169, 6227.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [182, 5503.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [167, 5924.0] + - - [768, 128, 1, 128, 768, 768, 768, 128] + - [143, 3146.0] + - - [1152, 128, 1, 128, 1152, 1152, 1152, 128] + - [152, 4369.0] + - - [1536, 128, 1, 128, 1536, 1536, 1536, 128] + - [173, 5115.0] + - - [1920, 128, 1, 128, 1920, 1920, 1920, 128] + - [136, 5617.0] + - - [768, 128, 1, 256, 768, 768, 768, 128] + - [152, 3884.0] + - - [1152, 128, 1, 256, 1152, 1152, 1152, 128] + - [201, 5825.0] + - - [1536, 128, 1, 256, 1536, 1536, 1536, 128] + - [136, 6420.0] + - - [1920, 128, 1, 256, 1920, 1920, 1920, 128] + - [138, 6665.0] + - - [448, 448, 1, 448, 448, 448, 448, 448] + - [160, 7322.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 32] + - [142, 9875.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 48] + - [171, 7706.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 48] + - [149, 7780.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 48] + - [149, 7777.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 32] + - [163, 9299.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 48] + - [149, 7517.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 48] + - [149, 7617.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 48] + - [149, 7617.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [198, 8207.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [163, 7562.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [165, 7577.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [198, 7918.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [147, 6915.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [147, 7419.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [147, 7499.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [147, 7980.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [206, 7460.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [158, 7982.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [144, 5436.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [144, 5401.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [165, 5561.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [156, 7618.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [184, 8253.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [160, 5499.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [160, 5454.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [173, 5631.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [159, 127.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [204, 7662.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [163, 6749.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [149, 7831.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [165, 8006.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [171, 7832.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [190, 7885.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [138, 6242.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [190, 9157.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [135, 730.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [141, 211.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [141, 638.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [141, 782.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [141, 178.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [141, 175.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [141, 207.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [141, 722.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [141, 621.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [141, 229.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [141, 210.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [141, 702.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [141, 192.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [141, 729.0] + - - [32, 32, 36, 43808, 32, 32, 32, 32] + - [179, 3472.0] + - - [32, 32, 64, 20000, 32, 32, 32, 32] + - [191, 3461.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [221, 7314.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 4] + - [236, 1199.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 4] + - [189, 636.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 4] + - [230, 803.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 4] + - [232, 1892.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 4] + - [230, 661.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 4] + - [189, 457.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 4] + - [227, 530.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 4] + - [228, 861.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 4] + - [230, 1327.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 4] + - [233, 810.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 4] + - [189, 518.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 4] + - [189, 319.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 4] + - [229, 1510.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 4] + - [230, 576.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 4] + - [227, 914.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 4] + - [237, 1862.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 4] + - [233, 711.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 4] + - [231, 1727.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 4] + - [230, 1091.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 4] + - [189, 365.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 4] + - [189, 290.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 4] + - [230, 503.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 4] + - [233, 1249.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 4] + - [233, 617.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 4] + - [230, 758.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 4] + - [233, 740.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 4] + - [230, 880.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 4] + - [235, 2092.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 4] + - [236, 1424.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 4] + - [189, 419.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 4] + - [189, 221.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 4] + - [238, 977.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 4] + - [230, 1580.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 4] + - [230, 996.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 4] + - [234, 1108.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 4] + - [235, 1671.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 1] + - [141, 145.0] + - - [2048, 1, 1, 960, 2048, 2048, 2048, 1] + - [146, 172.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [226, 5.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [226, 13.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [141, 21.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [226, 7.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 1856] + - [241, 745.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 2944] + - [249, 1006.0] + - - [4, 1408, 1, 128, 4, 4, 4, 1408] + - [189, 218.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 2368] + - [241, 814.0] + - - [4, 3584, 1, 128, 4, 4, 4, 3584] + - [187, 515.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 5888] + - [243, 1572.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 1408] + - [241, 566.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 6784] + - [249, 1579.0] + - - [4, 4288, 1, 128, 4, 4, 4, 4288] + - [187, 606.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 5056] + - [249, 1455.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 6784] + - [242, 1493.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 2944] + - [241, 1111.0] + - - [4, 5056, 1, 256, 4, 4, 4, 5056] + - [245, 973.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 5056] + - [244, 1355.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 2368] + - [241, 898.0] + - - [4, 1856, 1, 256, 4, 4, 4, 1856] + - [248, 411.0] + - - [4, 2368, 1, 256, 4, 4, 4, 2368] + - [174, 505.0] + - - [4, 2944, 1, 256, 4, 4, 4, 2944] + - [174, 620.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 4288] + - [249, 1402.0] + - - [4, 6784, 1, 128, 4, 4, 4, 6784] + - [247, 895.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 3584] + - [241, 1190.0] + - - [4, 5888, 1, 256, 4, 4, 4, 5888] + - [245, 1096.0] + - - [4, 6784, 1, 256, 4, 4, 4, 6784] + - [246, 1106.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1408] + - [250, 495.0] + - - [4, 3584, 1, 256, 4, 4, 4, 3584] + - [239, 711.0] + - - [4, 1408, 1, 256, 4, 4, 4, 1408] + - [240, 316.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 4288] + - [244, 1562.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 5888] + - [243, 1485.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1856] + - [249, 651.0] + - - [4, 1856, 1, 128, 4, 4, 4, 1856] + - [150, 285.0] + - - [4, 2944, 1, 128, 4, 4, 4, 2944] + - [240, 433.0] + - - [4, 5056, 1, 128, 4, 4, 4, 5056] + - [247, 700.0] + - - [4, 4288, 1, 256, 4, 4, 4, 4288] + - [245, 838.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3584] + - [249, 1314.0] + - - [4, 5888, 1, 128, 4, 4, 4, 5888] + - [247, 806.0] + - - [4, 2368, 1, 128, 4, 4, 4, 2368] + - [240, 352.0] + - - [49, 1200, 1, 128, 49, 49, 49, 1200] + - [135, 2034.0] + - - [1, 1152, 1, 256, 1, 1, 1, 1152] + - [143, 68.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [143, 1676.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [143, 603.0] + - - [16, 32, 36, 5760, 16, 16, 16, 32] + - [210, 3267.0] + - - [3, 64, 36, 6272, 3, 3, 3, 64] + - [222, 746.0] + - - [3, 64, 64, 46208, 3, 3, 3, 64] + - [193, 578.0] + - - [3, 64, 64, 92416, 3, 3, 3, 64] + - [178, 576.0] + - - [1, 16, 36, 23040, 1, 1, 1, 16] + - [214, 201.0] + - - [1, 16, 64, 10240, 1, 1, 1, 16] + - [210, 233.0] + - - [3, 64, 36, 25088, 3, 3, 3, 64] + - [220, 588.0] + - - [3, 64, 64, 11552, 3, 3, 3, 64] + - [219, 654.0] + - - [3, 64, 36, 200704, 3, 3, 3, 64] + - [209, 571.0] + - - [3, 64, 64, 23104, 3, 3, 3, 64] + - [205, 576.0] + - - [3, 64, 36, 100352, 3, 3, 3, 64] + - [216, 572.0] + - - [3, 64, 36, 50176, 3, 3, 3, 64] + - [224, 572.0] + - - [8, 384, 64, 6600, 8, 8, 8, 384] + - [210, 1527.0] + - - [65, 1024, 1, 6400, 65, 65, 65, 1024] + - [211, 4894.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [222, 2367.0] + - - [256, 1, 1, 32768, 256, 256, 256, 1] + - [215, 106.0] + - - [256, 4, 1, 6912, 256, 256, 256, 4] + - [215, 320.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [217, 2452.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [225, 408.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [212, 419.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [212, 422.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [217, 1718.0] + - - [256, 1, 1, 6912, 256, 256, 256, 1] + - [218, 81.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [213, 5685.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [223, 367.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [212, 408.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [143, 3474.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [164, 784.0] + - - [4, 704, 1, 1280, 4, 4, 4, 704] + - [143, 259.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [203, 507.0] + - - [64, 4, 1, 256, 64, 64, 64, 4] + - [164, 15.0] + - - [64, 704, 1, 128, 64, 64, 64, 704] + - [172, 1825.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [148, 2710.0] + - - [128, 4, 1, 1280, 128, 128, 128, 4] + - [143, 47.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [143, 4140.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [166, 3517.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 64] + - [164, 2527.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [164, 4144.0] + - - [4, 704, 1, 256, 4, 4, 4, 704] + - [143, 165.0] + - - [704, 4, 1, 1280, 704, 704, 704, 4] + - [197, 260.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [164, 1748.0] + - - [64, 1024, 1, 128, 64, 64, 64, 1024] + - [164, 2512.0] + - - [4, 64, 1, 1280, 4, 4, 4, 64] + - [143, 24.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [148, 3451.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [148, 2703.0] + - - [448, 4, 1, 256, 448, 448, 448, 4] + - [164, 103.0] + - - [448, 4, 1, 1280, 448, 448, 448, 4] + - [143, 165.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [143, 20.0] + - - [256, 4, 1, 128, 256, 256, 256, 4] + - [143, 41.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [193, 3777.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [143, 504.0] + - - [704, 64, 1, 128, 704, 704, 704, 64] + - [141, 1814.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 4] + - [143, 237.0] + - - [256, 256, 1, 128, 256, 256, 256, 256] + - [164, 2512.0] + - - [64, 256, 1, 128, 64, 64, 64, 256] + - [143, 704.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [166, 3512.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [143, 2834.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [164, 1279.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [148, 3089.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [148, 3020.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [164, 1389.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [148, 869.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [148, 1731.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [164, 1997.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [148, 3022.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [148, 4360.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 4] + - [148, 426.0] + - - [4, 4, 1, 256, 4, 4, 4, 4] + - [133, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [143, 1008.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [148, 3089.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [148, 777.0] + - - [4, 448, 1, 3328, 4, 4, 4, 448] + - [148, 188.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [164, 3214.0] + - - [256, 4, 1, 1280, 256, 256, 256, 4] + - [148, 95.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [141, 2507.0] + - - [4, 704, 1, 128, 4, 4, 4, 704] + - [143, 113.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [164, 619.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [164, 2823.0] + - - [448, 64, 1, 128, 448, 448, 448, 64] + - [164, 1223.0] + - - [4, 448, 1, 1280, 4, 4, 4, 448] + - [143, 165.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [143, 3214.0] + - - [256, 64, 1, 128, 256, 256, 256, 64] + - [164, 699.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 1024] + - [148, 427.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [197, 4357.0] + - - [704, 4, 1, 128, 704, 704, 704, 4] + - [164, 111.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [143, 59.0] + - - [256, 4, 1, 3328, 256, 256, 256, 4] + - [148, 107.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [164, 60.0] + - - [4, 4, 1, 128, 4, 4, 4, 4] + - [133, 1.0] + - - [4, 128, 1, 256, 4, 4, 4, 128] + - [143, 30.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [148, 388.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [148, 3818.0] + - - [4, 448, 1, 128, 4, 4, 4, 448] + - [143, 71.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [148, 1549.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [164, 2534.0] + - - [4, 128, 1, 3328, 4, 4, 4, 128] + - [148, 54.0] + - - [64, 4, 1, 128, 64, 64, 64, 4] + - [141, 10.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [143, 257.0] + - - [4, 704, 1, 3328, 4, 4, 4, 704] + - [148, 295.0] + - - [4, 4, 1, 1280, 4, 4, 4, 4] + - [133, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [141, 733.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 4] + - [143, 163.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [148, 434.0] + - - [4, 64, 1, 128, 4, 4, 4, 64] + - [141, 10.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [148, 777.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [148, 1556.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [164, 1997.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [148, 1551.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 4] + - [197, 377.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [141, 2486.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [139, 3652.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [148, 869.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [164, 1756.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [143, 1226.0] + - - [4, 256, 1, 128, 4, 4, 4, 256] + - [164, 41.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [164, 3226.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [164, 354.0] + - - [4, 4, 1, 3328, 4, 4, 4, 4] + - [143, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1024] + - [148, 377.0] + - - [704, 4, 1, 256, 704, 704, 704, 4] + - [143, 164.0] + - - [128, 4, 1, 3328, 128, 128, 128, 4] + - [148, 54.0] + - - [448, 4, 1, 3328, 448, 448, 448, 4] + - [148, 187.0] + - - [704, 4, 1, 3328, 704, 704, 704, 4] + - [148, 294.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [139, 3637.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [197, 4357.0] + - - [4, 1024, 1, 128, 4, 4, 4, 1024] + - [143, 165.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [148, 1735.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [164, 2198.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [164, 1398.0] + - - [128, 4, 1, 256, 128, 128, 128, 4] + - [143, 30.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [148, 4145.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [148, 3447.0] + - - [448, 4, 1, 128, 448, 448, 448, 4] + - [164, 71.0] + - - [4, 256, 1, 3328, 4, 4, 4, 256] + - [148, 108.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [141, 20.0] + - - [4, 256, 1, 1280, 4, 4, 4, 256] + - [148, 95.0] + - - [64, 4, 1, 3328, 64, 64, 64, 4] + - [148, 27.0] + - - [4, 64, 1, 3328, 4, 4, 4, 64] + - [148, 27.0] + - - [4, 1024, 1, 256, 4, 4, 4, 1024] + - [143, 239.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [164, 1008.0] + - - [4, 64, 1, 256, 4, 4, 4, 64] + - [143, 15.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [135, 2224.0] + - - [64, 448, 1, 128, 64, 64, 64, 448] + - [164, 1232.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [145, 3779.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [148, 3818.0] + - - [4, 448, 1, 256, 4, 4, 4, 448] + - [164, 104.0] + - - [4, 128, 1, 1280, 4, 4, 4, 128] + - [148, 48.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [164, 352.0] + - - [64, 64, 1, 128, 64, 64, 64, 64] + - [164, 177.0] + - - [64, 4, 1, 1280, 64, 64, 64, 4] + - [143, 24.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [148, 1731.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [164, 1013.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [142, 5292.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [136, 4489.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [184, 5855.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [152, 2027.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [136, 1435.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [136, 1834.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [136, 6285.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [135, 4200.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [135, 2688.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [173, 4953.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [135, 3880.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [152, 2369.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [135, 3619.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [135, 3369.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [135, 3024.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [184, 6647.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [160, 4062.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [184, 4355.0] + - - [768, 2, 1, 16, 768, 768, 768, 2] + - [135, 14.0] + - - [768, 2, 1, 32, 768, 768, 768, 2] + - [135, 25.0] + - - [3, 64, 64, 2888, 3, 3, 3, 64] + - [187, 769.0] + - - [1, 16, 64, 640, 1, 1, 1, 16] + - [146, 77.0] + - - [512, 24, 36, 800, 512, 512, 512, 24] + - [169, 7444.0] + - - [16, 32, 36, 360, 16, 16, 16, 32] + - [164, 1211.0] + - - [1, 16, 36, 1440, 1, 1, 1, 16] + - [143, 54.0] + - - [512, 24, 64, 512, 512, 512, 512, 24] + - [190, 7345.0] + - - [3, 64, 36, 3136, 3, 3, 3, 64] + - [145, 633.0] + - - [256, 24, 64, 32, 256, 256, 256, 24] + - [136, 3010.0] + - - [256, 16, 36, 3200, 256, 256, 256, 16] + - [195, 5023.0] + - - [256, 16, 36, 32, 256, 256, 256, 16] + - [183, 1787.0] + - - [512, 24, 36, 288, 512, 512, 512, 24] + - [142, 6894.0] + - - [512, 24, 64, 128, 512, 512, 512, 24] + - [163, 6729.0] + - - [3, 64, 64, 1444, 3, 3, 3, 64] + - [139, 746.0] + - - [16, 32, 64, 160, 16, 16, 16, 32] + - [153, 1417.0] + - - [256, 16, 64, 32, 256, 256, 256, 16] + - [159, 2605.0] + - - [256, 16, 64, 1568, 256, 256, 256, 16] + - [195, 5073.0] + - - [256, 24, 36, 128, 256, 256, 256, 24] + - [160, 4552.0] + - - [16, 32, 64, 2560, 16, 16, 16, 32] + - [145, 2962.0] + - - [49, 800, 1, 128, 49, 49, 49, 800] + - [200, 1520.0] + - - [64, 12, 2520, 12, 64, 64, 64, 12] + - [135, 2657.0] + - - [64, 13, 2336, 13, 64, 64, 64, 13] + - [135, 3001.0] + - - [64, 14, 2184, 14, 64, 64, 64, 14] + - [135, 3231.0] + - - [64, 15, 2048, 15, 64, 64, 64, 15] + - [135, 3470.0] + - - [64, 16, 1920, 16, 64, 64, 64, 16] + - [135, 3827.0] + - - [64, 17, 1816, 17, 64, 64, 64, 17] + - [160, 3434.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [160, 3778.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [201, 3887.0] + - - [64, 21, 1488, 21, 64, 64, 64, 21] + - [136, 4268.0] + - - [64, 23, 1360, 23, 64, 64, 64, 23] + - [136, 4698.0] + - - [64, 25, 1256, 25, 64, 64, 64, 25] + - [160, 5233.0] + - - [64, 27, 1168, 27, 64, 64, 64, 27] + - [173, 5527.0] + - - [64, 29, 1088, 29, 64, 64, 64, 29] + - [136, 5891.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [143, 154.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [170, 212.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [135, 8.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [151, 129.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [133, 891.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 1] + - [141, 51.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [151, 320.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [133, 1.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [170, 203.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [135, 9.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [135, 10.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [148, 29.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [148, 29.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [148, 30.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [139, 9.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [193, 11.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [148, 29.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [148, 30.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [148, 31.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [155, 9.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [135, 10.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [135, 11.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [135, 9.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [141, 9.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [166, 10.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [133, 34.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [141, 6.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [148, 1604.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [164, 4033.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [164, 2558.0] + - - [384, 128, 1, 128, 384, 384, 384, 128] + - [141, 1978.0] + - - [384, 128, 1, 256, 384, 384, 384, 128] + - [141, 2700.0] + - - [64, 64, 1, 64, 64, 64, 64, 64] + - [137, 113.0] + - - [256, 4, 1, 4096, 256, 256, 256, 4] + - [148, 109.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [134, 6326.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [168, 2914.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [168, 2840.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [139, 2350.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [139, 1412.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [133, 1467.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [141, 13.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [143, 189.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [137, 36.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [137, 39.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [185, 40.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [141, 7.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [148, 208.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [141, 11.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [143, 182.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [141, 12.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [143, 185.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [134, 615.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [134, 631.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [136, 630.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [136, 633.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [157, 3800.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [157, 3961.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [157, 4007.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [181, 4101.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [199, 3399.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [134, 3831.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [158, 3928.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [182, 4198.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [182, 4924.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [182, 5766.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [134, 5946.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [158, 6017.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [199, 5953.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [159, 3879.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [183, 4026.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [135, 4105.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [159, 4160.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [142, 4240.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [184, 4367.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [184, 4464.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [184, 4851.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [184, 5375.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [201, 5951.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [173, 6866.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [160, 6894.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [184, 7021.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [164, 2512.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [133, 1.0] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [133, 17.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1] + - [148, 98.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [151, 431.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [133, 2325.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [148, 16.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [137, 52.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [148, 691.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [148, 708.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [143, 574.0] + - - [256, 1, 1, 3456, 256, 256, 256, 1] + - [148, 27.0] + - - [256, 1, 1, 4096, 256, 256, 256, 1] + - [146, 27.0] + - - [256, 1, 1, 864, 256, 256, 256, 1] + - [143, 22.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [148, 3409.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [148, 3492.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [164, 2866.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [135, 60.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [150, 60.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [141, 15.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [173, 6134.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [160, 6593.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [143, 81.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [135, 67.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [148, 152.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [202, 4342.0] + - - [2, 1024, 1, 6, 2, 2, 2, 1024] + - [135, 8.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [148, 22.0] - null -- DeviceEfficiency diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB_GB.yaml new file mode 100644 index 000000000..7fe1d21bb --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bjlk_SB_GB.yaml @@ -0,0 +1,64892 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi31 +- gfx1100 +- [Device 744c] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x16_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 4 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SN_SU32_SUM3_TT1_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [17, 18267.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [5, 17183.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [33, 18648.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [17, 18594.0] + - - [3072, 768, 1, 4096, 3072, 3072, 3072, 768] + - [38, 19943.0] + - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] + - [33, 18813.0] + - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] + - [17, 18839.0] + - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] + - [33, 18466.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [21, 20100.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [18, 19114.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [33, 19146.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [3, 19493.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [9, 17238.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [37, 20476.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [3, 18253.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 5056] + - [33, 18166.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [37, 20233.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [34, 19309.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [20, 18702.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [21, 19356.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [17, 16814.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [20, 18788.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [37, 20128.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [9, 15038.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 1408] + - [35, 15149.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [38, 19306.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [37, 20352.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [0, 16082.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [34, 18858.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [37, 20134.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [5, 17229.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [33, 17843.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [38, 15534.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [3, 19113.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [33, 18773.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [3, 16422.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [33, 17937.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [21, 20715.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [18, 19582.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 6784] + - [33, 18839.0] + - - [704, 5056, 1, 128, 704, 704, 704, 5056] + - [47, 15486.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [3, 18685.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [37, 20786.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [17, 19155.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [21, 20049.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [17, 17574.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [37, 20258.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [37, 19800.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [33, 18143.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 2944] + - [17, 17442.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [1, 19574.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [20, 18355.0] + - - [448, 5888, 1, 128, 448, 448, 448, 5888] + - [33, 14259.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [17, 19271.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [17, 14796.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [1, 20479.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 2944] + - [17, 19280.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [0, 16792.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [36, 18790.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 6784] + - [17, 18365.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [33, 18127.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [33, 17984.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [0, 18929.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [33, 19183.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [15, 16951.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [37, 20838.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [20, 18278.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [33, 14689.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [17, 15416.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [5, 18665.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [20, 19047.0] + - - [704, 2944, 1, 128, 704, 704, 704, 2944] + - [0, 14545.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [33, 18072.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [5, 17377.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [33, 18281.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 1408] + - [0, 18334.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [37, 20147.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [36, 19427.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [1, 19827.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 4288] + - [9, 18130.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [18, 19707.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [37, 20184.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [37, 20613.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [34, 20209.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [37, 19816.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [38, 17542.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [37, 20585.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [33, 19227.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [4, 20162.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 5888] + - [38, 19446.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [34, 17693.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 3584] + - [38, 18344.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [37, 20950.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [38, 19537.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [15, 17674.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 2368] + - [17, 18699.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [33, 17437.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [6, 17821.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [4, 20159.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [1, 19855.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [20, 17576.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [13, 19146.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [17, 17633.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [0, 18208.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [21, 19710.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [17, 18826.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 448] + - [0, 16063.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [34, 18328.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [4, 20880.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [1, 19591.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [17, 15847.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [33, 18006.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [21, 20330.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 5888] + - [5, 19285.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 3584] + - [38, 17923.0] + - - [448, 3584, 1, 128, 448, 448, 448, 3584] + - [24, 12686.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [4, 20393.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 5888] + - [33, 18680.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [1, 20253.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 704] + - [33, 14360.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [37, 20546.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 2368] + - [33, 18959.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 704] + - [9, 16665.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [36, 19134.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [37, 20170.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [4, 20833.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [37, 20248.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [9, 16740.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 5888] + - [33, 18323.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [21, 20585.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [4, 20320.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [33, 19183.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [3, 19054.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [18, 18932.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [22, 19299.0] + - - [256, 5056, 1, 128, 256, 256, 256, 5056] + - [33, 14088.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [5, 18945.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [37, 19368.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [44, 16128.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 1408] + - [9, 18124.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [25, 19649.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [0, 16394.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [17, 19154.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 2368] + - [33, 19289.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [10, 19534.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [17, 17394.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [37, 20276.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [33, 17438.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 2944] + - [38, 19695.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [33, 19123.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 1856] + - [33, 18887.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 1024] + - [0, 15628.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 3584] + - [0, 19056.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [37, 20940.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [5, 19272.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [33, 19088.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [4, 20391.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [17, 15807.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [33, 17118.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [33, 15100.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [37, 19748.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 704] + - [33, 15720.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [10, 19624.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 5888] + - [1, 20003.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [38, 17591.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [37, 19411.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [10, 19457.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [20, 19040.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 4288] + - [9, 19165.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 1856] + - [0, 16997.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [21, 19430.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [37, 20378.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 2368] + - [33, 18295.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 4288] + - [0, 17791.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [0, 18215.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [33, 19333.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 6784] + - [1, 19927.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [1, 19829.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [33, 16157.0] + - - [448, 4288, 1, 128, 448, 448, 448, 4288] + - [33, 13661.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [4, 20216.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [19, 16931.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 4288] + - [17, 18391.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [38, 17504.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [37, 19927.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [17, 19565.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [36, 19428.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [5, 18679.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [1, 20133.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 2368] + - [17, 18712.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [21, 20199.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [25, 20146.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 3584] + - [15, 17534.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [17, 17547.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 6784] + - [34, 19702.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [17, 18827.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [0, 16697.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [1, 19647.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [4, 20525.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 5888] + - [1, 19832.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 5888] + - [18, 19353.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [20, 19304.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [33, 19492.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [18, 19276.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [21, 20267.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [38, 17393.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 704] + - [33, 17823.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [33, 18936.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [5, 17319.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 2368] + - [0, 17235.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [4, 20778.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [17, 18087.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 6784] + - [43, 19735.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [33, 17242.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [33, 15508.0] + - - [256, 5888, 1, 128, 256, 256, 256, 5888] + - [0, 14356.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [4, 20180.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [33, 19373.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [3, 18734.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 4288] + - [33, 19496.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [33, 18435.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 704] + - [33, 16478.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 5056] + - [0, 19334.0] + - - [448, 5056, 1, 128, 448, 448, 448, 5056] + - [0, 13715.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 5056] + - [33, 18423.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 3584] + - [38, 19433.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [0, 19295.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [4, 20311.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 5056] + - [17, 18491.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [14, 20567.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [4, 20422.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [21, 20223.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [33, 17904.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [33, 16303.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [4, 20449.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [37, 20856.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [4, 20277.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 3584] + - [38, 19593.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 1856] + - [9, 16234.0] + - - [704, 3584, 1, 128, 704, 704, 704, 3584] + - [17, 14614.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [17, 17507.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [34, 19668.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 2944] + - [5, 18992.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [4, 20132.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [21, 20251.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [38, 19757.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [33, 18389.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 3584] + - [38, 19449.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [17, 17486.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [21, 19416.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [38, 16905.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [37, 20705.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [5, 18570.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [33, 17769.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 4288] + - [33, 19550.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [33, 18648.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [20, 19200.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [38, 18099.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [1, 19378.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 3584] + - [33, 17322.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 1408] + - [0, 15177.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 2944] + - [17, 17918.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 2944] + - [5, 18999.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [21, 20409.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 2368] + - [33, 16842.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 2368] + - [33, 19153.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [37, 20463.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [21, 20118.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [9, 18503.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [37, 19766.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [3, 19091.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 5056] + - [17, 19329.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [34, 18974.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [37, 20121.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [5, 15879.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [4, 20248.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [20, 19107.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 448] + - [33, 14914.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [17, 18604.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [33, 18753.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [17, 18816.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [4, 20793.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [37, 20104.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [38, 18651.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [1, 20100.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [38, 19381.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 1408] + - [17, 17408.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [37, 20699.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [24, 19312.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [37, 19313.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [1, 19706.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [9, 18585.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [33, 18075.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [1, 20388.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [5, 18607.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [22, 17257.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [33, 18316.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [33, 19224.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 2944] + - [22, 19009.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [37, 20673.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [20, 19724.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [1, 19491.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [21, 20370.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [47, 16743.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [37, 19234.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [38, 18487.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [38, 17078.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [33, 18216.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [33, 19373.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [17, 17476.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 2368] + - [33, 16015.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [37, 19819.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [33, 18019.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [4, 20099.0] + - - [448, 6784, 1, 128, 448, 448, 448, 6784] + - [17, 14647.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [4, 20304.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [0, 18654.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [34, 19095.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [21, 19288.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 448] + - [17, 14813.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [17, 15989.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [38, 17718.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [17, 19458.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [0, 17085.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [3, 19514.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [33, 17522.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [33, 19483.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [33, 18750.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [17, 17775.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [0, 17381.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 2368] + - [0, 17679.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 1408] + - [33, 18874.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [1, 19863.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [17, 18761.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 3584] + - [17, 19076.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [46, 19979.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 1024] + - [33, 14498.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [5, 15987.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [37, 19649.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [37, 19614.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [22, 16607.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [1, 18875.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [17, 16336.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [38, 15785.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [5, 17516.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 1024] + - [22, 18879.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [0, 18114.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 5056] + - [33, 19280.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [34, 19221.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [4, 19745.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 6784] + - [34, 19263.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [37, 20758.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 1856] + - [9, 18062.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 2944] + - [0, 17372.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 448] + - [17, 16899.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [36, 16254.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 1856] + - [9, 16965.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [9, 16918.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [33, 17943.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 1024] + - [33, 17463.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [22, 16835.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [20, 19257.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [18, 19805.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [22, 15276.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [21, 20059.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 1024] + - [33, 18340.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [37, 20472.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [1, 20011.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [33, 18378.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [22, 18904.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 1856] + - [33, 18271.0] + - - [256, 6784, 1, 128, 256, 256, 256, 6784] + - [33, 16179.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 6784] + - [38, 19579.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 5056] + - [0, 19080.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 5888] + - [0, 18276.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [1, 19953.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [0, 18967.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [1, 20032.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [38, 17156.0] + - - [704, 5888, 1, 128, 704, 704, 704, 5888] + - [38, 16088.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 3584] + - [1, 19593.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [37, 20217.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 1408] + - [33, 15454.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [24, 19004.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [0, 18068.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [17, 13943.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [33, 17404.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [17, 18456.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [33, 18260.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [4, 20205.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [34, 20102.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [3, 18911.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [37, 19677.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [5, 16137.0] + - - [704, 4288, 1, 128, 704, 704, 704, 4288] + - [22, 14907.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 4288] + - [0, 18389.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 6784] + - [22, 18720.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [33, 18287.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [47, 16207.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [22, 17745.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 5056] + - [24, 19543.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [17, 14360.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [3, 19726.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [17, 17392.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [20, 18494.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 256] + - [9, 12764.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 5888] + - [33, 18630.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [33, 16792.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 1856] + - [17, 15655.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [37, 20648.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [38, 17367.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [4, 20070.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [13, 18712.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [1, 18906.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [1, 19973.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [37, 18703.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [37, 19634.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [33, 17919.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [0, 19209.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 1024] + - [33, 16777.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [21, 20123.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [37, 20842.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 5056] + - [33, 19534.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [21, 19388.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [33, 15241.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [17, 14942.0] + - - [704, 2368, 1, 128, 704, 704, 704, 2368] + - [17, 13523.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 256] + - [33, 14166.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 1856] + - [33, 18566.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 4288] + - [0, 19011.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [33, 17631.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [17, 18736.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [5, 18899.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [33, 16740.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 6784] + - [1, 19927.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [37, 20593.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [33, 17927.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [37, 20216.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [33, 19062.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 5888] + - [38, 19713.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [1, 19910.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [0, 16293.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [0, 18731.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [1, 19269.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [1, 19388.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [4, 19252.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [33, 18246.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [37, 20698.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [37, 19272.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [33, 17405.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [22, 17104.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [38, 16507.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [0, 18698.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [33, 19129.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [9, 14558.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [21, 19818.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [38, 17274.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [37, 20485.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [1, 19141.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [33, 18475.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [9, 18640.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 1408] + - [33, 18584.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [37, 19688.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [1, 19552.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [9, 18630.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [22, 17979.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 1024] + - [5, 18256.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [1, 20031.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 256] + - [38, 16016.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 704] + - [17, 17419.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [0, 19385.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [20, 19248.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [5, 18842.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [34, 19772.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [33, 17662.0] + - - [704, 6784, 1, 128, 704, 704, 704, 6784] + - [33, 16549.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [1, 19811.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [37, 20563.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [34, 19807.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 6784] + - [38, 19015.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [5, 15919.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [43, 19673.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [33, 18089.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 5056] + - [9, 18019.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [21, 20832.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [17, 15694.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 448] + - [9, 15878.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 2944] + - [38, 19405.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [5, 19094.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [21, 19720.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [22, 18607.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 4288] + - [42, 19158.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [14, 20358.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [17, 15644.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 1408] + - [22, 17952.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [18, 19237.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [33, 17986.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [37, 20764.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [22, 18722.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [36, 17627.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [45, 19796.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 2944] + - [0, 16309.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [33, 18587.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [21, 20371.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [9, 18068.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [34, 20014.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [22, 18858.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [5, 18530.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 1856] + - [33, 19228.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 704] + - [33, 15496.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [47, 16641.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [46, 20304.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [37, 20795.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 1408] + - [2, 16117.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 1024] + - [33, 15431.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [37, 20545.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [17, 19161.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [38, 18682.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 4288] + - [33, 19236.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [18, 19528.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [5, 17685.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [33, 16446.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 1856] + - [17, 18473.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [0, 18889.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [4, 20097.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [17, 17378.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [37, 20904.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [5, 16919.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [33, 18224.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 1024] + - [0, 17062.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [4, 15443.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [0, 16756.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [22, 16839.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 10082.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [33, 17074.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [3, 14704.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 11077.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [17, 15910.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [17, 15030.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [17, 15136.0] + - - [5329, 160, 64, 64, 5329, 5329, 5329, 160] + - [0, 7550.0] + - - [1225, 384, 64, 192, 1225, 1225, 1225, 384] + - [17, 18282.0] + - - [289, 1024, 64, 256, 289, 289, 289, 1024] + - [22, 14868.0] + - - [1225, 384, 64, 64, 1225, 1225, 1225, 384] + - [0, 15328.0] + - - [1225, 384, 64, 96, 1225, 1225, 1225, 384] + - [0, 14647.0] + - - [289, 1024, 64, 384, 289, 289, 289, 1024] + - [1, 15079.0] + - - [289, 1024, 64, 192, 289, 289, 289, 1024] + - [5, 14752.0] + - - [289, 1024, 64, 128, 289, 289, 289, 1024] + - [17, 14493.0] + - - [4096, 1024, 1, 2984, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [1024, 4096, 1, 3437, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [1024, 4096, 1, 3235, 1024, 1024, 1024, 4096] + - [0, 18593.0] + - - [4096, 1024, 1, 4032, 4096, 4096, 4096, 1024] + - [33, 18603.0] + - - [1024, 4096, 1, 3334, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3288, 4096, 4096, 4096, 1024] + - [17, 18648.0] + - - [1024, 4096, 1, 3515, 1024, 1024, 1024, 4096] + - [17, 18586.0] + - - [4096, 1024, 1, 3437, 4096, 4096, 4096, 1024] + - [42, 18614.0] + - - [1024, 4096, 1, 3259, 1024, 1024, 1024, 4096] + - [33, 18594.0] + - - [1024, 4096, 1, 3384, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3458, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [1024, 4096, 1, 3412, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3529, 1024, 1024, 1024, 4096] + - [9, 18595.0] + - - [1024, 4096, 1, 4032, 1024, 1024, 1024, 4096] + - [33, 18614.0] + - - [4096, 1024, 1, 3999, 4096, 4096, 4096, 1024] + - [17, 18604.0] + - - [1024, 4096, 1, 3079, 1024, 1024, 1024, 4096] + - [17, 18581.0] + - - [1024, 4096, 1, 3876, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 3450, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3256, 1024, 1024, 1024, 4096] + - [17, 18608.0] + - - [4096, 1024, 1, 3403, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 4096, 1, 3359, 1024, 1024, 1024, 4096] + - [24, 18604.0] + - - [4096, 1024, 1, 3549, 4096, 4096, 4096, 1024] + - [33, 18598.0] + - - [4096, 1024, 1, 3176, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [1024, 4096, 1, 3504, 1024, 1024, 1024, 4096] + - [42, 18608.0] + - - [4096, 1024, 1, 3314, 4096, 4096, 4096, 1024] + - [33, 18607.0] + - - [4096, 1024, 1, 3183, 4096, 4096, 4096, 1024] + - [17, 18575.0] + - - [1024, 4096, 1, 3209, 1024, 1024, 1024, 4096] + - [0, 18615.0] + - - [1024, 4096, 1, 3720, 1024, 1024, 1024, 4096] + - [0, 18637.0] + - - [1024, 4096, 1, 3859, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [1024, 33708, 1, 4059, 1024, 1024, 1024, 33708] + - [4, 20849.0] + - - [4096, 1024, 1, 3477, 4096, 4096, 4096, 1024] + - [24, 18588.0] + - - [4096, 1024, 1, 3233, 4096, 4096, 4096, 1024] + - [17, 18590.0] + - - [4096, 1024, 1, 3409, 4096, 4096, 4096, 1024] + - [33, 18591.0] + - - [4096, 1024, 1, 3564, 4096, 4096, 4096, 1024] + - [0, 18606.0] + - - [4096, 1024, 1, 3190, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3288, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3451, 4096, 4096, 4096, 1024] + - [0, 18590.0] + - - [1024, 4096, 1, 3348, 1024, 1024, 1024, 4096] + - [9, 18597.0] + - - [1024, 4096, 1, 3465, 1024, 1024, 1024, 4096] + - [0, 18605.0] + - - [1024, 33708, 1, 4032, 1024, 1024, 1024, 33708] + - [21, 20870.0] + - - [1024, 33708, 1, 3840, 1024, 1024, 1024, 33708] + - [4, 20855.0] + - - [4096, 1024, 1, 3391, 4096, 4096, 4096, 1024] + - [33, 18593.0] + - - [1024, 4096, 1, 3530, 1024, 1024, 1024, 4096] + - [33, 18585.0] + - - [4096, 1024, 1, 3209, 4096, 4096, 4096, 1024] + - [33, 18603.0] + - - [1024, 4096, 1, 3457, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3386, 1024, 1024, 1024, 4096] + - [0, 18595.0] + - - [4096, 1024, 1, 3350, 4096, 4096, 4096, 1024] + - [17, 18587.0] + - - [1024, 4096, 1, 3184, 1024, 1024, 1024, 4096] + - [17, 18595.0] + - - [1024, 4096, 1, 3093, 1024, 1024, 1024, 4096] + - [0, 18621.0] + - - [1024, 4096, 1, 3400, 1024, 1024, 1024, 4096] + - [33, 18607.0] + - - [1024, 4096, 1, 3214, 1024, 1024, 1024, 4096] + - [0, 18592.0] + - - [4096, 1024, 1, 3406, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3565, 1024, 1024, 1024, 4096] + - [17, 18600.0] + - - [4096, 1024, 1, 3536, 4096, 4096, 4096, 1024] + - [33, 18597.0] + - - [1024, 4096, 1, 3183, 1024, 1024, 1024, 4096] + - [17, 18596.0] + - - [1024, 4096, 1, 3462, 1024, 1024, 1024, 4096] + - [33, 18598.0] + - - [4096, 1024, 1, 3130, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3381, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3298, 4096, 4096, 4096, 1024] + - [42, 18624.0] + - - [1024, 4096, 1, 3292, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [4096, 1024, 1, 3289, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3379, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3990, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3540, 1024, 1024, 1024, 4096] + - [33, 18606.0] + - - [4096, 1024, 1, 3412, 4096, 4096, 4096, 1024] + - [0, 18610.0] + - - [1024, 4096, 1, 3555, 1024, 1024, 1024, 4096] + - [42, 18603.0] + - - [1024, 4096, 1, 3518, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3189, 4096, 4096, 4096, 1024] + - [17, 18591.0] + - - [1024, 4096, 1, 3298, 1024, 1024, 1024, 4096] + - [24, 18601.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [0, 18619.0] + - - [1024, 4096, 1, 3393, 1024, 1024, 1024, 4096] + - [33, 18600.0] + - - [1024, 4096, 1, 3207, 1024, 1024, 1024, 4096] + - [9, 18586.0] + - - [4096, 1024, 1, 3487, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3431, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3378, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [4096, 1024, 1, 3529, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3460, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3336, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3501, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3584, 1024, 1024, 1024, 4096] + - [0, 18639.0] + - - [4096, 1024, 1, 2499, 4096, 4096, 4096, 1024] + - [17, 18605.0] + - - [4096, 1024, 1, 3352, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3543, 1024, 1024, 1024, 4096] + - [17, 18619.0] + - - [1024, 4096, 1, 3476, 1024, 1024, 1024, 4096] + - [0, 18629.0] + - - [1024, 33708, 1, 3822, 1024, 1024, 1024, 33708] + - [4, 20869.0] + - - [1024, 4096, 1, 3436, 1024, 1024, 1024, 4096] + - [33, 18584.0] + - - [1024, 4096, 1, 3594, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3514, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [1024, 4096, 1, 3064, 1024, 1024, 1024, 4096] + - [24, 18608.0] + - - [4096, 1024, 1, 3371, 4096, 4096, 4096, 1024] + - [42, 18613.0] + - - [4096, 1024, 1, 3558, 4096, 4096, 4096, 1024] + - [33, 18642.0] + - - [4096, 1024, 1, 3517, 4096, 4096, 4096, 1024] + - [17, 18601.0] + - - [4096, 1024, 1, 3144, 4096, 4096, 4096, 1024] + - [0, 18616.0] + - - [1024, 4096, 1, 3312, 1024, 1024, 1024, 4096] + - [24, 18606.0] + - - [4096, 1024, 1, 3079, 4096, 4096, 4096, 1024] + - [17, 18619.0] + - - [1024, 4096, 1, 3415, 1024, 1024, 1024, 4096] + - [9, 18609.0] + - - [1024, 4096, 1, 3221, 1024, 1024, 1024, 4096] + - [17, 18613.0] + - - [1024, 4096, 1, 3978, 1024, 1024, 1024, 4096] + - [33, 18599.0] + - - [4096, 1024, 1, 3876, 4096, 4096, 4096, 1024] + - [42, 18623.0] + - - [1024, 4096, 1, 3528, 1024, 1024, 1024, 4096] + - [24, 18609.0] + - - [1024, 4096, 1, 3181, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3445, 4096, 4096, 4096, 1024] + - [0, 18617.0] + - - [4096, 1024, 1, 3450, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [4096, 1024, 1, 3377, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3532, 1024, 1024, 1024, 4096] + - [33, 18622.0] + - - [1024, 33708, 1, 3944, 1024, 1024, 1024, 33708] + - [21, 20878.0] + - - [4096, 1024, 1, 3483, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3358, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3464, 4096, 4096, 4096, 1024] + - [33, 18610.0] + - - [4096, 1024, 1, 3282, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [4096, 1024, 1, 3256, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [1024, 4096, 1, 3057, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 3481, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3340, 4096, 4096, 4096, 1024] + - [42, 18617.0] + - - [1024, 4096, 1, 3273, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [4096, 1024, 1, 3392, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3337, 4096, 4096, 4096, 1024] + - [17, 18615.0] + - - [4096, 1024, 1, 3359, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [4096, 1024, 1, 3498, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3169, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 33708, 1, 3859, 1024, 1024, 1024, 33708] + - [37, 20881.0] + - - [1024, 4096, 1, 3103, 1024, 1024, 1024, 4096] + - [9, 18583.0] + - - [4096, 1024, 1, 3900, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3442, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3248, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3351, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [4096, 1024, 1, 3593, 4096, 4096, 4096, 1024] + - [17, 18643.0] + - - [1024, 4096, 1, 3780, 1024, 1024, 1024, 4096] + - [17, 18648.0] + - - [1024, 33708, 1, 3681, 1024, 1024, 1024, 33708] + - [21, 20865.0] + - - [4096, 1024, 1, 3374, 4096, 4096, 4096, 1024] + - [17, 18607.0] + - - [1024, 4096, 1, 3557, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3906, 4096, 4096, 4096, 1024] + - [17, 18600.0] + - - [4096, 1024, 1, 3504, 4096, 4096, 4096, 1024] + - [33, 18618.0] + - - [1024, 4096, 1, 3270, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [4096, 1024, 1, 3098, 4096, 4096, 4096, 1024] + - [0, 18612.0] + - - [4096, 1024, 1, 3216, 4096, 4096, 4096, 1024] + - [24, 18620.0] + - - [1024, 4096, 1, 3550, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3449, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [1024, 4096, 1, 3403, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3523, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3486, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3564, 1024, 1024, 1024, 4096] + - [0, 18634.0] + - - [1024, 33708, 1, 4005, 1024, 1024, 1024, 33708] + - [37, 20869.0] + - - [4096, 1024, 1, 3296, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3263, 1024, 1024, 1024, 4096] + - [42, 18614.0] + - - [1024, 4096, 1, 3130, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3295, 1024, 1024, 1024, 4096] + - [0, 18642.0] + - - [1024, 33708, 1, 3925, 1024, 1024, 1024, 33708] + - [4, 20879.0] + - - [1024, 4096, 1, 3378, 1024, 1024, 1024, 4096] + - [17, 18621.0] + - - [4096, 1024, 1, 3720, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3399, 4096, 4096, 4096, 1024] + - [42, 18620.0] + - - [4096, 1024, 1, 3543, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [4096, 1024, 1, 3497, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3594, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3144, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 4096, 1, 3975, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3205, 4096, 4096, 4096, 1024] + - [33, 18616.0] + - - [1024, 33708, 1, 3995, 1024, 1024, 1024, 33708] + - [4, 20864.0] + - - [1024, 4096, 1, 3392, 1024, 1024, 1024, 4096] + - [17, 18616.0] + - - [1024, 4096, 1, 3055, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 4026, 1024, 1024, 1024, 4096] + - [0, 18614.0] + - - [4096, 1024, 1, 3557, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [4096, 1024, 1, 3515, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3486, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3457, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [1024, 4096, 1, 3511, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [4096, 1024, 1, 3138, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [1024, 4096, 1, 3339, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3939, 1024, 1024, 1024, 4096] + - [17, 18639.0] + - - [4096, 1024, 1, 3500, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [4096, 1024, 1, 3395, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 4020, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3942, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3349, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 4096, 1, 3322, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [4096, 1024, 1, 3452, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [1024, 4096, 1, 3417, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 3526, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [4096, 1024, 1, 3485, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3303, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [4096, 1024, 1, 3344, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3479, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [4096, 1024, 1, 3300, 4096, 4096, 4096, 1024] + - [17, 18614.0] + - - [1024, 4096, 1, 3439, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [4096, 1024, 1, 3280, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3245, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3328, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3418, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [1024, 4096, 1, 3493, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3500, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [1024, 4096, 1, 3166, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [4096, 1024, 1, 3126, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3277, 1024, 1024, 1024, 4096] + - [17, 18623.0] + - - [1024, 4096, 1, 3315, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [1024, 4096, 1, 3414, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3531, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [4096, 1024, 1, 3484, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3180, 1024, 1024, 1024, 4096] + - [17, 18618.0] + - - [4096, 1024, 1, 3360, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 33708, 1, 3990, 1024, 1024, 1024, 33708] + - [4, 20876.0] + - - [4096, 1024, 1, 3466, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3428, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [1024, 4096, 1, 3137, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 4059, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [1024, 4096, 1, 3353, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [1024, 4096, 1, 3942, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3506, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3508, 4096, 4096, 4096, 1024] + - [17, 18609.0] + - - [4096, 1024, 1, 3956, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3272, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3443, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3375, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 4096, 1, 3525, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3472, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [1024, 4096, 1, 3520, 1024, 1024, 1024, 4096] + - [33, 18645.0] + - - [4096, 1024, 1, 3322, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 3387, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 33708, 1, 3939, 1024, 1024, 1024, 33708] + - [37, 20880.0] + - - [4096, 1024, 1, 3345, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 2967, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3453, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 4096, 1, 3640, 1024, 1024, 1024, 4096] + - [33, 18645.0] + - - [4096, 1024, 1, 3291, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3350, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3417, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [1024, 4096, 1, 3467, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3491, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3822, 1024, 1024, 1024, 4096] + - [17, 18649.0] + - - [4096, 1024, 1, 3292, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [1024, 4096, 1, 3231, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3364, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3995, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3545, 1024, 1024, 1024, 4096] + - [17, 18655.0] + - - [1024, 4096, 1, 3186, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3432, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3367, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3503, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3095, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [4096, 1024, 1, 3465, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3402, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3140, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3424, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3257, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [4096, 1024, 1, 2917, 4096, 4096, 4096, 1024] + - [42, 18621.0] + - - [1024, 33708, 1, 3640, 1024, 1024, 1024, 33708] + - [37, 20877.0] + - - [1024, 4096, 1, 3456, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [1024, 4096, 1, 3014, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [4096, 1024, 1, 3372, 4096, 4096, 4096, 1024] + - [33, 18640.0] + - - [1024, 4096, 1, 3294, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3446, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3389, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3259, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3544, 4096, 4096, 4096, 1024] + - [17, 18658.0] + - - [4096, 1024, 1, 3479, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3542, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3321, 4096, 4096, 4096, 1024] + - [42, 18634.0] + - - [1024, 4096, 1, 3147, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 4096, 1, 3944, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [4096, 1024, 1, 3870, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3308, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3401, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 4096, 1, 3395, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [1024, 4096, 1, 3563, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 33708, 1, 3870, 1024, 1024, 1024, 33708] + - [4, 20871.0] + - - [4096, 1024, 1, 3494, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3271, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [1024, 33708, 1, 3910, 1024, 1024, 1024, 33708] + - [37, 20861.0] + - - [1024, 4096, 1, 3287, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [1024, 33708, 1, 3860, 1024, 1024, 1024, 33708] + - [4, 20855.0] + - - [4096, 1024, 1, 3341, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3136, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3439, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [1024, 4096, 1, 3751, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3301, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [4096, 1024, 1, 3468, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3416, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3163, 4096, 4096, 4096, 1024] + - [33, 18617.0] + - - [1024, 4096, 1, 3230, 1024, 1024, 1024, 4096] + - [0, 18610.0] + - - [1024, 4096, 1, 3581, 1024, 1024, 1024, 4096] + - [24, 18604.0] + - - [4096, 1024, 1, 3463, 4096, 4096, 4096, 1024] + - [24, 18612.0] + - - [1024, 4096, 1, 3478, 1024, 1024, 1024, 4096] + - [0, 18613.0] + - - [4096, 1024, 1, 3262, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3438, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3244, 1024, 1024, 1024, 4096] + - [0, 18641.0] + - - [1024, 4096, 1, 3445, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 4096, 1024] + - [42, 18622.0] + - - [1024, 4096, 1, 3492, 1024, 1024, 1024, 4096] + - [0, 18627.0] + - - [4096, 1024, 1, 3211, 4096, 4096, 4096, 1024] + - [42, 18610.0] + - - [1024, 4096, 1, 3910, 1024, 1024, 1024, 4096] + - [0, 18616.0] + - - [1024, 4096, 1, 3314, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3859, 4096, 4096, 4096, 1024] + - [42, 18616.0] + - - [4096, 1024, 1, 3383, 4096, 4096, 4096, 1024] + - [42, 18616.0] + - - [1024, 4096, 1, 3409, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 4020, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3530, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3411, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [1024, 4096, 1, 3566, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3493, 4096, 4096, 4096, 1024] + - [24, 18628.0] + - - [4096, 1024, 1, 3184, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3431, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 3306, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [1024, 4096, 1, 3352, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 3295, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3517, 1024, 1024, 1024, 4096] + - [0, 18644.0] + - - [4096, 1024, 1, 3426, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3385, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3572, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3459, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [1024, 4096, 1, 3374, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3166, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [4096, 1024, 1, 3093, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3523, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3413, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 3996, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3452, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3232, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3400, 4096, 4096, 4096, 1024] + - [17, 18646.0] + - - [4096, 1024, 1, 3334, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3345, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3538, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3466, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [4096, 1024, 1, 3315, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [4096, 1024, 1, 3214, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [1024, 33708, 1, 3900, 1024, 1024, 1024, 33708] + - [37, 20876.0] + - - [1024, 4096, 1, 3367, 1024, 1024, 1024, 4096] + - [33, 18622.0] + - - [1024, 4096, 1, 2917, 1024, 1024, 1024, 4096] + - [33, 18623.0] + - - [1024, 4096, 1, 3544, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3414, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [4096, 1024, 1, 3565, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3512, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3191, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [1024, 4096, 1, 3289, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3290, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3211, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 33708, 1, 3969, 1024, 1024, 1024, 33708] + - [37, 20880.0] + - - [4096, 1024, 1, 3566, 4096, 4096, 4096, 1024] + - [24, 18604.0] + - - [1024, 4096, 1, 3459, 1024, 1024, 1024, 4096] + - [33, 18642.0] + - - [1024, 4096, 1, 3372, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3339, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [4096, 1024, 1, 3425, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [4096, 1024, 1, 3388, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3531, 1024, 1024, 1024, 4096] + - [33, 18621.0] + - - [4096, 1024, 1, 3286, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [4096, 1024, 1, 3462, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [1024, 4096, 1, 3388, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [4096, 1024, 1, 3165, 4096, 4096, 4096, 1024] + - [0, 18607.0] + - - [4096, 1024, 1, 3304, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [1024, 4096, 1, 2736, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3397, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [1024, 4096, 1, 3311, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [1024, 4096, 1, 3394, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 2736, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3559, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3180, 4096, 4096, 4096, 1024] + - [33, 18614.0] + - - [1024, 4096, 1, 3480, 1024, 1024, 1024, 4096] + - [0, 18642.0] + - - [4096, 1024, 1, 3318, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3213, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [1024, 4096, 1, 3286, 1024, 1024, 1024, 4096] + - [17, 18639.0] + - - [4096, 1024, 1, 3471, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3381, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3502, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [1024, 4096, 1, 3552, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [4096, 1024, 1, 3519, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3300, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 4096, 1, 3419, 1024, 1024, 1024, 4096] + - [17, 18630.0] + - - [4096, 1024, 1, 4030, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3976, 4096, 4096, 4096, 1024] + - [17, 18648.0] + - - [1024, 4096, 1, 3473, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [4096, 1024, 1, 3428, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [1024, 4096, 1, 3433, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3534, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3461, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3681, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3495, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3351, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [1024, 4096, 1, 4059, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3990, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3325, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 4096, 1, 3408, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [4096, 1024, 1, 3394, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3573, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [4096, 1024, 1, 3386, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3540, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3182, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [1024, 4096, 1, 3430, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3236, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 2977, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3355, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3139, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [4096, 1024, 1, 3516, 4096, 4096, 4096, 1024] + - [33, 18639.0] + - - [4096, 1024, 1, 3368, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3559, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3506, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3145, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3369, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [4096, 1024, 1, 3522, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 33708, 1, 3894, 1024, 1024, 1024, 33708] + - [4, 20875.0] + - - [4096, 1024, 1, 3336, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3382, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3533, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 4050, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3480, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3344, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [1024, 4096, 1, 3509, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [1024, 4096, 1, 3956, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 3616, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [1024, 4096, 1, 3366, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [4096, 1024, 1, 2935, 4096, 4096, 4096, 1024] + - [0, 18609.0] + - - [4096, 1024, 1, 3393, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3547, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [1024, 4096, 1, 3499, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3357, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3272, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [4096, 1024, 1, 3207, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [4096, 1024, 1, 3894, 4096, 4096, 4096, 1024] + - [33, 18633.0] + - - [1024, 4096, 1, 3444, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [4096, 1024, 1, 3561, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3376, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3458, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [4096, 1024, 1, 3231, 4096, 4096, 4096, 1024] + - [33, 18619.0] + - - [1024, 4096, 1, 3505, 1024, 1024, 1024, 4096] + - [33, 18627.0] + - - [4096, 1024, 1, 3277, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [1024, 4096, 1, 3391, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [1024, 4096, 1, 3536, 1024, 1024, 1024, 4096] + - [17, 18634.0] + - - [1024, 4096, 1, 3063, 1024, 1024, 1024, 4096] + - [17, 18619.0] + - - [1024, 4096, 1, 3189, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 2505, 1024, 1024, 1024, 4096] + - [17, 18605.0] + - - [4096, 1024, 1, 3454, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 3405, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [1024, 33708, 1, 4050, 1024, 1024, 1024, 33708] + - [21, 20879.0] + - - [4096, 1024, 1, 3520, 4096, 4096, 4096, 1024] + - [17, 18641.0] + - - [1024, 4096, 1, 3487, 1024, 1024, 1024, 4096] + - [33, 18601.0] + - - [1024, 4096, 1, 3558, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3297, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3483, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 33708, 1, 3751, 1024, 1024, 1024, 33708] + - [4, 20871.0] + - - [4096, 1024, 1, 3380, 4096, 4096, 4096, 1024] + - [24, 18603.0] + - - [1024, 4096, 1, 3380, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3396, 1024, 1024, 1024, 4096] + - [0, 18627.0] + - - [1024, 4096, 1, 3497, 1024, 1024, 1024, 4096] + - [0, 18628.0] + - - [1024, 4096, 1, 3502, 1024, 1024, 1024, 4096] + - [0, 18631.0] + - - [1024, 4096, 1, 3138, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3939, 4096, 4096, 4096, 1024] + - [33, 18640.0] + - - [1024, 4096, 1, 3303, 1024, 1024, 1024, 4096] + - [17, 18620.0] + - - [1024, 4096, 1, 3418, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3224, 1024, 1024, 1024, 4096] + - [33, 18649.0] + - - [4096, 1024, 1, 3978, 4096, 4096, 4096, 1024] + - [33, 18642.0] + - - [1024, 4096, 1, 3472, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3353, 4096, 4096, 4096, 1024] + - [0, 18624.0] + - - [4096, 1024, 1, 3362, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [1024, 33708, 1, 3978, 1024, 1024, 1024, 33708] + - [4, 20873.0] + - - [1024, 4096, 1, 3432, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3139, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 4096, 1, 3341, 1024, 1024, 1024, 4096] + - [24, 18607.0] + - - [1024, 4096, 1, 3494, 1024, 1024, 1024, 4096] + - [33, 18646.0] + - - [1024, 4096, 1, 3969, 1024, 1024, 1024, 4096] + - [17, 18643.0] + - - [1024, 4096, 1, 3163, 1024, 1024, 1024, 4096] + - [9, 18598.0] + - - [4096, 1024, 1, 3405, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [4096, 1024, 1, 3453, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [1024, 4096, 1, 3411, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 3527, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3474, 4096, 4096, 4096, 1024] + - [33, 18629.0] + - - [1024, 4096, 1, 3572, 1024, 1024, 1024, 4096] + - [17, 18623.0] + - - [4096, 1024, 1, 3293, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3247, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3425, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [1024, 4096, 1, 3354, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [4096, 1024, 1, 3382, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3236, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3519, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3354, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3501, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3266, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [1024, 4096, 1, 3368, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 4096, 1, 4030, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 3533, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3332, 4096, 4096, 4096, 1024] + - [17, 18619.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3616, 1024, 1024, 1024, 4096] + - [0, 18618.0] + - - [4096, 1024, 1, 3265, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3361, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3467, 4096, 4096, 4096, 1024] + - [33, 18647.0] + - - [1024, 4096, 1, 3454, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3101, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3508, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [4096, 1024, 1, 3267, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3419, 4096, 4096, 4096, 1024] + - [17, 18636.0] + - - [4096, 1024, 1, 3822, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3266, 1024, 1024, 1024, 4096] + - [17, 18624.0] + - - [4096, 1024, 1, 3440, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3361, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 4096, 1, 3546, 1024, 1024, 1024, 4096] + - [33, 18647.0] + - - [4096, 1024, 1, 3473, 4096, 4096, 4096, 1024] + - [0, 18618.0] + - - [4096, 1024, 1, 3546, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3088, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3535, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [1024, 4096, 1, 3447, 1024, 1024, 1024, 4096] + - [33, 18652.0] + - - [1024, 4096, 1, 3560, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [1024, 4096, 1, 3422, 1024, 1024, 1024, 4096] + - [0, 18636.0] + - - [1024, 4096, 1, 3469, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3488, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3110, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 3265, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [1024, 4096, 1, 3291, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3390, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [4096, 1024, 1, 3046, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 4096, 1, 3539, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3221, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [4096, 1024, 1, 3433, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3364, 4096, 4096, 4096, 1024] + - [17, 18637.0] + - - [4096, 1024, 1, 3470, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3404, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [1024, 33708, 1, 3968, 1024, 1024, 1024, 33708] + - [21, 20878.0] + - - [4096, 1024, 1, 3088, 4096, 4096, 4096, 1024] + - [33, 18620.0] + - - [1024, 4096, 1, 3247, 1024, 1024, 1024, 4096] + - [42, 18603.0] + - - [1024, 33708, 1, 3996, 1024, 1024, 1024, 33708] + - [37, 20870.0] + - - [4096, 1024, 1, 3482, 4096, 4096, 4096, 1024] + - [17, 18628.0] + - - [4096, 1024, 1, 3995, 4096, 4096, 4096, 1024] + - [17, 18627.0] + - - [1024, 4096, 1, 3280, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [4096, 1024, 1, 3271, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3545, 4096, 4096, 4096, 1024] + - [17, 18651.0] + - - [4096, 1024, 1, 3476, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3496, 4096, 4096, 4096, 1024] + - [24, 18633.0] + - - [4096, 1024, 1, 3191, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [4096, 1024, 1, 3311, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3302, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3681, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [4096, 1024, 1, 3582, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [4096, 1024, 1, 3421, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3560, 4096, 4096, 4096, 1024] + - [17, 18649.0] + - - [1024, 4096, 1, 3495, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [4096, 1024, 1, 3186, 4096, 4096, 4096, 1024] + - [33, 18631.0] + - - [4096, 1024, 1, 3925, 4096, 4096, 4096, 1024] + - [0, 18615.0] + - - [1024, 4096, 1, 3435, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [4096, 1024, 1, 3434, 4096, 4096, 4096, 1024] + - [33, 18624.0] + - - [1024, 33708, 1, 4012, 1024, 1024, 1024, 33708] + - [21, 20868.0] + - - [1024, 4096, 1, 3340, 1024, 1024, 1024, 4096] + - [17, 18612.0] + - - [4096, 1024, 1, 3489, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3162, 1024, 1024, 1024, 4096] + - [17, 18622.0] + - - [4096, 1024, 1, 3436, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3574, 4096, 4096, 4096, 1024] + - [17, 18639.0] + - - [4096, 1024, 1, 3469, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3410, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [1024, 4096, 1, 3216, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3095, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3448, 4096, 4096, 4096, 1024] + - [17, 18635.0] + - - [1024, 4096, 1, 3176, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 2918, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3424, 1024, 1024, 1024, 4096] + - [0, 18640.0] + - - [4096, 1024, 1, 3402, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [4096, 1024, 1, 3145, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 33708, 1, 3976, 1024, 1024, 1024, 33708] + - [37, 20877.0] + - - [4096, 1024, 1, 3518, 4096, 4096, 4096, 1024] + - [17, 18615.0] + - - [4096, 1024, 1, 3110, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [4096, 1024, 1, 3325, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 33708, 1, 3999, 1024, 1024, 1024, 33708] + - [4, 20870.0] + - - [4096, 1024, 1, 2985, 4096, 4096, 4096, 1024] + - [33, 18619.0] + - - [1024, 4096, 1, 3371, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3342, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [4096, 1024, 1, 3141, 4096, 4096, 4096, 1024] + - [0, 18601.0] + - - [4096, 1024, 1, 3532, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3169, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [1024, 4096, 1, 3514, 1024, 1024, 1024, 4096] + - [17, 18643.0] + - - [4096, 1024, 1, 3780, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [1024, 4096, 1, 3098, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3449, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [1024, 4096, 1, 3222, 1024, 1024, 1024, 4096] + - [17, 18626.0] + - - [1024, 4096, 1, 3346, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3064, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [4096, 1024, 1, 3511, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3384, 4096, 4096, 4096, 1024] + - [17, 18640.0] + - - [4096, 1024, 1, 3356, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3796, 1024, 1024, 1024, 4096] + - [17, 18638.0] + - - [4096, 1024, 1, 3427, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3390, 4096, 4096, 4096, 1024] + - [0, 18618.0] + - - [4096, 1024, 1, 3573, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [4096, 1024, 1, 3456, 4096, 4096, 4096, 1024] + - [17, 18650.0] + - - [1024, 4096, 1, 3360, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [1024, 33708, 1, 3977, 1024, 1024, 1024, 33708] + - [37, 20865.0] + - - [1024, 4096, 1, 2918, 1024, 1024, 1024, 4096] + - [17, 18578.0] + - - [4096, 1024, 1, 3975, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [4096, 1024, 1, 3525, 4096, 4096, 4096, 1024] + - [42, 18622.0] + - - [4096, 1024, 1, 3398, 4096, 4096, 4096, 1024] + - [33, 18622.0] + - - [4096, 1024, 1, 3640, 4096, 4096, 4096, 1024] + - [33, 18648.0] + - - [4096, 1024, 1, 3014, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3446, 1024, 1024, 1024, 4096] + - [33, 18601.0] + - - [1024, 33708, 1, 3796, 1024, 1024, 1024, 33708] + - [4, 20852.0] + - - [4096, 1024, 1, 3101, 4096, 4096, 4096, 1024] + - [33, 18585.0] + - - [4096, 1024, 1, 3563, 4096, 4096, 4096, 1024] + - [33, 18609.0] + - - [4096, 1024, 1, 3539, 4096, 4096, 4096, 1024] + - [42, 18607.0] + - - [4096, 1024, 1, 3182, 4096, 4096, 4096, 1024] + - [17, 18621.0] + - - [1024, 4096, 1, 3468, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [4096, 1024, 1, 3312, 4096, 4096, 4096, 1024] + - [17, 18598.0] + - - [4096, 1024, 1, 3215, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [4096, 1024, 1, 3910, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 33708, 1, 3780, 1024, 1024, 1024, 33708] + - [4, 20864.0] + - - [1024, 4096, 1, 3290, 1024, 1024, 1024, 4096] + - [33, 18588.0] + - - [1024, 4096, 1, 4012, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 4096, 1, 3385, 1024, 1024, 1024, 4096] + - [42, 18607.0] + - - [1024, 33708, 1, 3975, 1024, 1024, 1024, 33708] + - [4, 20876.0] + - - [4096, 1024, 1, 3996, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [4096, 1024, 1, 2765, 4096, 4096, 4096, 1024] + - [17, 18590.0] + - - [4096, 1024, 1, 3538, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [4096, 1024, 1, 3415, 4096, 4096, 4096, 1024] + - [42, 18633.0] + - - [1024, 4096, 1, 3554, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [4096, 1024, 1, 3513, 4096, 4096, 4096, 1024] + - [33, 18595.0] + - - [1024, 4096, 1, 3304, 1024, 1024, 1024, 4096] + - [9, 18613.0] + - - [4096, 1024, 1, 3294, 4096, 4096, 4096, 1024] + - [0, 18617.0] + - - [4096, 1024, 1, 3396, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3213, 1024, 1024, 1024, 4096] + - [24, 18602.0] + - - [4096, 1024, 1, 3137, 4096, 4096, 4096, 1024] + - [17, 18624.0] + - - [4096, 1024, 1, 3552, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3461, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [4096, 1024, 1, 3263, 4096, 4096, 4096, 1024] + - [42, 18604.0] + - - [4096, 1024, 1, 3430, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [4096, 1024, 1, 3389, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [4096, 1024, 1, 3528, 4096, 4096, 4096, 1024] + - [33, 18610.0] + - - [1024, 4096, 1, 3463, 1024, 1024, 1024, 4096] + - [17, 18602.0] + - - [4096, 1024, 1, 3526, 4096, 4096, 4096, 1024] + - [17, 18643.0] + - - [4096, 1024, 1, 3154, 4096, 4096, 4096, 1024] + - [17, 18620.0] + - - [4096, 1024, 1, 3499, 4096, 4096, 4096, 1024] + - [24, 18628.0] + - - [4096, 1024, 1, 3955, 4096, 4096, 4096, 1024] + - [17, 18609.0] + - - [1024, 4096, 1, 3297, 1024, 1024, 1024, 4096] + - [17, 18615.0] + - - [1024, 4096, 1, 3233, 1024, 1024, 1024, 4096] + - [17, 18605.0] + - - [1024, 4096, 1, 3226, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 3404, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [4096, 1024, 1, 3355, 4096, 4096, 4096, 1024] + - [33, 18599.0] + - - [1024, 4096, 1, 3542, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3181, 4096, 4096, 4096, 1024] + - [0, 18583.0] + - - [1024, 4096, 1, 3474, 1024, 1024, 1024, 4096] + - [17, 18631.0] + - - [4096, 1024, 1, 3319, 4096, 4096, 4096, 1024] + - [17, 18623.0] + - - [1024, 4096, 1, 3434, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [1024, 4096, 1, 3860, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3343, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [1024, 4096, 1, 3488, 1024, 1024, 1024, 4096] + - [0, 18631.0] + - - [1024, 4096, 1, 3046, 1024, 1024, 1024, 4096] + - [9, 18585.0] + - - [1024, 4096, 1, 3141, 1024, 1024, 1024, 4096] + - [24, 18602.0] + - - [1024, 4096, 1, 3516, 1024, 1024, 1024, 4096] + - [17, 18632.0] + - - [4096, 1024, 1, 3147, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3421, 1024, 1024, 1024, 4096] + - [17, 18622.0] + - - [4096, 1024, 1, 3944, 4096, 4096, 4096, 1024] + - [33, 18646.0] + - - [1024, 4096, 1, 3574, 1024, 1024, 1024, 4096] + - [24, 18601.0] + - - [1024, 4096, 1, 3977, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [1024, 4096, 1, 2985, 1024, 1024, 1024, 4096] + - [42, 18599.0] + - - [1024, 4096, 1, 3427, 1024, 1024, 1024, 4096] + - [0, 18634.0] + - - [1024, 4096, 1, 3482, 1024, 1024, 1024, 4096] + - [33, 18639.0] + - - [1024, 4096, 1, 3332, 1024, 1024, 1024, 4096] + - [0, 18632.0] + - - [4096, 1024, 1, 3308, 4096, 4096, 4096, 1024] + - [24, 18605.0] + - - [1024, 4096, 1, 3513, 1024, 1024, 1024, 4096] + - [17, 18640.0] + - - [1024, 4096, 1, 3154, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3955, 1024, 1024, 1024, 4096] + - [17, 18636.0] + - - [1024, 4096, 1, 2967, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 33708, 1, 3942, 1024, 1024, 1024, 33708] + - [4, 20880.0] + - - [1024, 4096, 1, 3319, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [4096, 1024, 1, 3860, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [1024, 4096, 1, 3548, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3977, 4096, 4096, 4096, 1024] + - [33, 18641.0] + - - [4096, 1024, 1, 3535, 4096, 4096, 4096, 1024] + - [17, 18600.0] + - - [1024, 4096, 1, 3541, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 33708, 1, 3584, 1024, 1024, 1024, 33708] + - [37, 20844.0] + - - [1024, 4096, 1, 3168, 1024, 1024, 1024, 4096] + - [33, 18632.0] + - - [1024, 4096, 1, 3448, 1024, 1024, 1024, 4096] + - [0, 18604.0] + - - [4096, 1024, 1, 3343, 4096, 4096, 4096, 1024] + - [24, 18606.0] + - - [1024, 4096, 1, 3357, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [4096, 1024, 1, 3510, 4096, 4096, 4096, 1024] + - [33, 18614.0] + - - [4096, 1024, 1, 3369, 4096, 4096, 4096, 1024] + - [33, 18589.0] + - - [4096, 1024, 1, 3379, 4096, 4096, 4096, 1024] + - [17, 18632.0] + - - [1024, 4096, 1, 3276, 1024, 1024, 1024, 4096] + - [33, 18629.0] + - - [1024, 4096, 1, 3363, 1024, 1024, 1024, 4096] + - [33, 18600.0] + - - [4096, 1024, 1, 3055, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3524, 1024, 1024, 1024, 4096] + - [33, 18604.0] + - - [4096, 1024, 1, 3057, 4096, 4096, 4096, 1024] + - [0, 18599.0] + - - [1024, 33708, 1, 3720, 1024, 1024, 1024, 33708] + - [21, 20847.0] + - - [1024, 4096, 1, 3383, 1024, 1024, 1024, 4096] + - [33, 18591.0] + - - [1024, 4096, 1, 3522, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 33708, 1, 3956, 1024, 1024, 1024, 33708] + - [37, 20849.0] + - - [1024, 4096, 1, 3481, 1024, 1024, 1024, 4096] + - [17, 18585.0] + - - [4096, 1024, 1, 3562, 4096, 4096, 4096, 1024] + - [33, 18594.0] + - - [4096, 1024, 1, 3299, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [1024, 4096, 1, 3262, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [1024, 33708, 1, 4026, 1024, 1024, 1024, 33708] + - [37, 20845.0] + - - [4096, 1024, 1, 3168, 4096, 4096, 4096, 1024] + - [33, 18588.0] + - - [1024, 4096, 1, 3999, 1024, 1024, 1024, 4096] + - [17, 18596.0] + - - [1024, 4096, 1, 3549, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3375, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [1024, 4096, 1, 3496, 1024, 1024, 1024, 4096] + - [33, 18608.0] + - - [1024, 4096, 1, 3190, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3273, 4096, 4096, 4096, 1024] + - [24, 18610.0] + - - [1024, 4096, 1, 3406, 1024, 1024, 1024, 4096] + - [17, 18590.0] + - - [4096, 1024, 1, 4005, 4096, 4096, 4096, 1024] + - [17, 18605.0] + - - [4096, 1024, 1, 3555, 4096, 4096, 4096, 1024] + - [33, 18605.0] + - - [4096, 1024, 1, 2505, 4096, 4096, 4096, 1024] + - [33, 18576.0] + - - [1024, 4096, 1, 3460, 1024, 1024, 1024, 4096] + - [33, 18607.0] + - - [1024, 4096, 1, 3579, 1024, 1024, 1024, 4096] + - [24, 18608.0] + - - [1024, 33708, 1, 4030, 1024, 1024, 1024, 33708] + - [4, 20866.0] + - - [1024, 4096, 1, 3510, 1024, 1024, 1024, 4096] + - [17, 18588.0] + - - [1024, 4096, 1, 3282, 1024, 1024, 1024, 4096] + - [17, 18591.0] + - - [1024, 4096, 1, 3377, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 2935, 1024, 1024, 1024, 4096] + - [33, 18595.0] + - - [1024, 4096, 1, 3498, 1024, 1024, 1024, 4096] + - [0, 18639.0] + - - [1024, 4096, 1, 3593, 1024, 1024, 1024, 4096] + - [0, 18604.0] + - - [4096, 1024, 1, 3226, 4096, 4096, 4096, 1024] + - [17, 18593.0] + - - [1024, 4096, 1, 2499, 1024, 1024, 1024, 4096] + - [42, 18585.0] + - - [1024, 4096, 1, 3296, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 4096, 1, 3455, 1024, 1024, 1024, 4096] + - [17, 18603.0] + - - [1024, 4096, 1, 3399, 1024, 1024, 1024, 4096] + - [33, 18631.0] + - - [1024, 4096, 1, 3205, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [4096, 1024, 1, 4026, 4096, 4096, 4096, 1024] + - [33, 18606.0] + - - [1024, 4096, 1, 3484, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3302, 4096, 4096, 4096, 1024] + - [17, 18617.0] + - - [1024, 4096, 1, 3485, 1024, 1024, 1024, 4096] + - [17, 18594.0] + - - [1024, 4096, 1, 3126, 1024, 1024, 1024, 4096] + - [42, 18620.0] + - - [1024, 4096, 1, 4050, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3235, 4096, 4096, 4096, 1024] + - [33, 18617.0] + - - [1024, 33708, 1, 3955, 1024, 1024, 1024, 33708] + - [4, 20875.0] + - - [1024, 4096, 1, 3342, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 3397, 1024, 1024, 1024, 4096] + - [0, 18625.0] + - - [4096, 1024, 1, 3491, 4096, 4096, 4096, 1024] + - [24, 18618.0] + - - [1024, 4096, 1, 3503, 1024, 1024, 1024, 4096] + - [17, 18641.0] + - - [1024, 4096, 1, 3140, 1024, 1024, 1024, 4096] + - [17, 18591.0] + - - [4096, 1024, 1, 3121, 4096, 4096, 4096, 1024] + - [42, 18619.0] + - - [4096, 1024, 1, 3276, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [1024, 4096, 1, 3321, 1024, 1024, 1024, 4096] + - [17, 18592.0] + - - [1024, 4096, 1, 3870, 1024, 1024, 1024, 4096] + - [17, 18600.0] + - - [4096, 1024, 1, 3475, 4096, 4096, 4096, 1024] + - [33, 18630.0] + - - [1024, 4096, 1, 2984, 1024, 1024, 1024, 4096] + - [0, 18602.0] + - - [4096, 1024, 1, 3363, 4096, 4096, 4096, 1024] + - [17, 18594.0] + - - [1024, 4096, 1, 3582, 1024, 1024, 1024, 4096] + - [42, 18606.0] + - - [4096, 1024, 1, 3509, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 4096, 1, 3426, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [4096, 1024, 1, 3136, 4096, 4096, 4096, 1024] + - [33, 18611.0] + - - [1024, 4096, 1, 3232, 1024, 1024, 1024, 4096] + - [33, 18606.0] + - - [4096, 1024, 1, 3103, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [1024, 4096, 1, 3335, 1024, 1024, 1024, 4096] + - [0, 18620.0] + - - [1024, 4096, 1, 3900, 1024, 1024, 1024, 4096] + - [33, 18641.0] + - - [4096, 1024, 1, 3512, 4096, 4096, 4096, 1024] + - [33, 18635.0] + - - [4096, 1024, 1, 3222, 4096, 4096, 4096, 1024] + - [24, 18609.0] + - - [1024, 4096, 1, 3165, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3408, 4096, 4096, 4096, 1024] + - [33, 18644.0] + - - [4096, 1024, 1, 3751, 4096, 4096, 4096, 1024] + - [17, 18638.0] + - - [1024, 4096, 1, 3318, 1024, 1024, 1024, 4096] + - [17, 18625.0] + - - [4096, 1024, 1, 3442, 4096, 4096, 4096, 1024] + - [33, 18636.0] + - - [1024, 4096, 1, 3413, 1024, 1024, 1024, 4096] + - [0, 18603.0] + - - [4096, 1024, 1, 3524, 4096, 4096, 4096, 1024] + - [0, 18614.0] + - - [1024, 4096, 1, 3976, 1024, 1024, 1024, 4096] + - [33, 18651.0] + - - [1024, 4096, 1, 3475, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [1024, 4096, 1, 3534, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [4096, 1024, 1, 3301, 4096, 4096, 4096, 1024] + - [42, 18623.0] + - - [4096, 1024, 1, 3248, 4096, 4096, 4096, 1024] + - [17, 18642.0] + - - [1024, 4096, 1, 2977, 1024, 1024, 1024, 4096] + - [33, 18587.0] + - - [4096, 1024, 1, 3346, 4096, 4096, 4096, 1024] + - [33, 18627.0] + - - [1024, 4096, 1, 3451, 1024, 1024, 1024, 4096] + - [33, 18643.0] + - - [1024, 4096, 1, 3257, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [1024, 4096, 1, 3356, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3348, 4096, 4096, 4096, 1024] + - [33, 18625.0] + - - [4096, 1024, 1, 3335, 4096, 4096, 4096, 1024] + - [17, 18633.0] + - - [4096, 1024, 1, 3505, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3490, 1024, 1024, 1024, 4096] + - [33, 18636.0] + - - [4096, 1024, 1, 3447, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [1024, 4096, 1, 3267, 1024, 1024, 1024, 4096] + - [33, 18628.0] + - - [4096, 1024, 1, 3230, 4096, 4096, 4096, 1024] + - [17, 18631.0] + - - [4096, 1024, 1, 3455, 4096, 4096, 4096, 1024] + - [17, 18634.0] + - - [1024, 4096, 1, 3925, 1024, 1024, 1024, 4096] + - [17, 18604.0] + - - [1024, 4096, 1, 3362, 1024, 1024, 1024, 4096] + - [17, 18629.0] + - - [4096, 1024, 1, 3969, 4096, 4096, 4096, 1024] + - [17, 18625.0] + - - [4096, 1024, 1, 3527, 4096, 4096, 4096, 1024] + - [33, 18634.0] + - - [1024, 4096, 1, 3585, 1024, 1024, 1024, 4096] + - [9, 18597.0] + - - [4096, 1024, 1, 3063, 4096, 4096, 4096, 1024] + - [0, 18602.0] + - - [4096, 1024, 1, 3435, 4096, 4096, 4096, 1024] + - [33, 18626.0] + - - [4096, 1024, 1, 3366, 4096, 4096, 4096, 1024] + - [0, 18613.0] + - - [4096, 1024, 1, 3581, 4096, 4096, 4096, 1024] + - [33, 18645.0] + - - [1024, 33708, 1, 3906, 1024, 1024, 1024, 33708] + - [4, 20873.0] + - - [1024, 4096, 1, 3464, 1024, 1024, 1024, 4096] + - [17, 18627.0] + - - [1024, 4096, 1, 3440, 1024, 1024, 1024, 4096] + - [3, 18610.0] + - - [4096, 1024, 1, 3143, 4096, 4096, 4096, 1024] + - [42, 18611.0] + - - [1024, 4096, 1, 3349, 1024, 1024, 1024, 4096] + - [42, 18608.0] + - - [4096, 1024, 1, 3416, 4096, 4096, 4096, 1024] + - [33, 18653.0] + - - [4096, 1024, 1, 3365, 4096, 4096, 4096, 1024] + - [33, 18632.0] + - - [1024, 4096, 1, 3470, 1024, 1024, 1024, 4096] + - [17, 18628.0] + - - [4096, 1024, 1, 3287, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3441, 1024, 1024, 1024, 4096] + - [0, 18614.0] + - - [4096, 1024, 1, 3224, 4096, 4096, 4096, 1024] + - [33, 18637.0] + - - [1024, 4096, 1, 3387, 1024, 1024, 1024, 4096] + - [0, 18630.0] + - - [1024, 4096, 1, 3547, 1024, 1024, 1024, 4096] + - [17, 18635.0] + - - [4096, 1024, 1, 3478, 4096, 4096, 4096, 1024] + - [17, 18629.0] + - - [4096, 1024, 1, 3548, 4096, 4096, 4096, 1024] + - [33, 18638.0] + - - [1024, 33708, 1, 4020, 1024, 1024, 1024, 33708] + - [4, 20880.0] + - - [4096, 1024, 1, 3320, 4096, 4096, 4096, 1024] + - [33, 18596.0] + - - [1024, 4096, 1, 3906, 1024, 1024, 1024, 4096] + - [9, 18606.0] + - - [4096, 1024, 1, 3796, 4096, 4096, 4096, 1024] + - [33, 18615.0] + - - [1024, 4096, 1, 3306, 1024, 1024, 1024, 4096] + - [17, 18602.0] + - - [1024, 4096, 1, 3401, 1024, 1024, 1024, 4096] + - [17, 18599.0] + - - [1024, 4096, 1, 3215, 1024, 1024, 1024, 4096] + - [33, 18637.0] + - - [4096, 1024, 1, 4012, 4096, 4096, 4096, 1024] + - [0, 18610.0] + - - [1024, 4096, 1, 2765, 1024, 1024, 1024, 4096] + - [17, 18609.0] + - - [4096, 1024, 1, 3554, 4096, 4096, 4096, 1024] + - [33, 18602.0] + - - [4096, 1024, 1, 3423, 4096, 4096, 4096, 1024] + - [24, 18606.0] + - - [1024, 4096, 1, 3562, 1024, 1024, 1024, 4096] + - [33, 18597.0] + - - [1024, 4096, 1, 3489, 1024, 1024, 1024, 4096] + - [17, 18633.0] + - - [4096, 1024, 1, 3358, 4096, 4096, 4096, 1024] + - [42, 18621.0] + - - [4096, 1024, 1, 3270, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [1024, 4096, 1, 3293, 1024, 1024, 1024, 4096] + - [17, 18617.0] + - - [1024, 4096, 1, 3376, 1024, 1024, 1024, 4096] + - [33, 18603.0] + - - [4096, 1024, 1, 3245, 4096, 4096, 4096, 1024] + - [42, 18607.0] + - - [4096, 1024, 1, 3541, 4096, 4096, 4096, 1024] + - [33, 18628.0] + - - [4096, 1024, 1, 3443, 4096, 4096, 4096, 1024] + - [17, 18626.0] + - - [4096, 1024, 1, 3438, 4096, 4096, 4096, 1024] + - [17, 18603.0] + - - [4096, 1024, 1, 3244, 4096, 4096, 4096, 1024] + - [17, 18630.0] + - - [1024, 4096, 1, 3365, 1024, 1024, 1024, 4096] + - [33, 18630.0] + - - [1024, 4096, 1, 3299, 1024, 1024, 1024, 4096] + - [17, 18597.0] + - - [1024, 4096, 1, 3471, 1024, 1024, 1024, 4096] + - [33, 18599.0] + - - [1024, 4096, 1, 3398, 1024, 1024, 1024, 4096] + - [33, 18644.0] + - - [4096, 1024, 1, 3162, 4096, 4096, 4096, 1024] + - [33, 18592.0] + - - [1024, 4096, 1, 4005, 1024, 1024, 1024, 4096] + - [33, 18602.0] + - - [4096, 1024, 1, 3579, 4096, 4096, 4096, 1024] + - [24, 18620.0] + - - [1024, 4096, 1, 3121, 1024, 1024, 1024, 4096] + - [33, 18589.0] + - - [4096, 1024, 1, 3441, 4096, 4096, 4096, 1024] + - [33, 18600.0] + - - [4096, 1024, 1, 3422, 4096, 4096, 4096, 1024] + - [24, 18615.0] + - - [4096, 1024, 1, 3444, 4096, 4096, 4096, 1024] + - [24, 18614.0] + - - [1024, 4096, 1, 3337, 1024, 1024, 1024, 4096] + - [24, 18598.0] + - - [4096, 1024, 1, 3550, 4096, 4096, 4096, 1024] + - [42, 18609.0] + - - [1024, 4096, 1, 3477, 1024, 1024, 1024, 4096] + - [42, 18612.0] + - - [4096, 1024, 1, 3490, 4096, 4096, 4096, 1024] + - [24, 18617.0] + - - [4096, 1024, 1, 3585, 4096, 4096, 4096, 1024] + - [0, 18616.0] + - - [1024, 4096, 1, 3143, 1024, 1024, 1024, 4096] + - [33, 18635.0] + - - [1024, 33708, 1, 3876, 1024, 1024, 1024, 33708] + - [4, 20879.0] + - - [1024, 4096, 1, 3320, 1024, 1024, 1024, 4096] + - [33, 18595.0] + - - [1024, 4096, 1, 3423, 1024, 1024, 1024, 4096] + - [33, 18633.0] + - - [1024, 4096, 1, 3894, 1024, 1024, 1024, 4096] + - [17, 18637.0] + - - [4096, 1024, 1, 3410, 4096, 4096, 4096, 1024] + - [33, 18623.0] + - - [1024, 4096, 1, 3561, 1024, 1024, 1024, 4096] + - [33, 18597.0] + - - [4096, 1024, 1, 3492, 4096, 4096, 4096, 1024] + - [17, 18592.0] + - - [36548, 1024, 1, 3712, 36548, 36548, 36548, 1024] + - [4, 20844.0] + - - [4096, 2048, 1, 128, 4096, 4096, 4096, 2048] + - [17, 18506.0] + - - [4096, 3072, 1, 128, 4096, 4096, 4096, 3072] + - [33, 19188.0] + - - [768, 3072, 1, 4096, 768, 768, 768, 3072] + - [5, 19946.0] + - - [768, 30522, 1, 1280, 768, 768, 768, 30522] + - [37, 20670.0] + - - [768, 30522, 1, 320, 768, 768, 768, 30522] + - [34, 20228.0] + - - [768, 30522, 1, 640, 768, 768, 768, 30522] + - [37, 20506.0] + - - [256, 512, 36, 98, 256, 256, 256, 512] + - [33, 17358.0] + - - [256, 256, 64, 56, 256, 256, 256, 256] + - [9, 15534.0] + - - [512, 486, 36, 800, 512, 512, 512, 486] + - [1, 19191.0] + - - [512, 512, 36, 1568, 512, 512, 512, 512] + - [21, 20527.0] + - - [256, 384, 36, 4096, 256, 256, 256, 384] + - [22, 20040.0] + - - [128, 256, 64, 32, 128, 128, 128, 256] + - [19, 7661.0] + - - [128, 256, 64, 9, 128, 128, 128, 256] + - [36, 2904.0] + - - [256, 512, 36, 784, 256, 256, 256, 512] + - [4, 19977.0] + - - [256, 324, 36, 32, 256, 256, 256, 324] + - [0, 9691.0] + - - [512, 512, 36, 33, 512, 512, 512, 512] + - [33, 13671.0] + - - [192, 384, 64, 128, 192, 192, 192, 384] + - [0, 13182.0] + - - [512, 512, 64, 72, 512, 512, 512, 512] + - [9, 18993.0] + - - [512, 512, 36, 128, 512, 512, 512, 512] + - [22, 19309.0] + - - [192, 384, 64, 2304, 192, 192, 192, 384] + - [37, 14765.0] + - - [384, 256, 64, 450, 384, 384, 384, 256] + - [1, 19179.0] + - - [384, 256, 64, 2304, 384, 384, 384, 256] + - [37, 19761.0] + - - [512, 512, 64, 144, 512, 512, 512, 512] + - [33, 19399.0] + - - [256, 256, 36, 6272, 256, 256, 256, 256] + - [38, 19884.0] + - - [256, 384, 64, 2304, 256, 256, 256, 384] + - [21, 19761.0] + - - [512, 512, 36, 66, 512, 512, 512, 512] + - [33, 17273.0] + - - [128, 256, 64, 800, 128, 128, 128, 256] + - [22, 17397.0] + - - [192, 256, 36, 512, 192, 192, 192, 256] + - [38, 14160.0] + - - [256, 512, 64, 200, 256, 256, 256, 512] + - [0, 19026.0] + - - [256, 512, 64, 25, 256, 256, 256, 512] + - [35, 10744.0] + - - [128, 256, 36, 1568, 128, 128, 128, 256] + - [35, 18744.0] + - - [128, 256, 64, 288, 128, 128, 128, 256] + - [5, 16740.0] + - - [256, 384, 64, 1152, 256, 256, 256, 384] + - [4, 19583.0] + - - [160, 320, 64, 288, 160, 160, 160, 320] + - [3, 11335.0] + - - [128, 256, 36, 128, 128, 128, 128, 256] + - [0, 14435.0] + - - [512, 512, 36, 16, 512, 512, 512, 512] + - [9, 7634.0] + - - [384, 256, 36, 800, 384, 384, 384, 256] + - [22, 19845.0] + - - [192, 384, 36, 4096, 192, 192, 192, 384] + - [5, 15022.0] + - - [256, 384, 64, 576, 256, 256, 256, 384] + - [38, 19418.0] + - - [512, 512, 64, 14, 512, 512, 512, 512] + - [3, 6848.0] + - - [512, 512, 36, 8, 512, 512, 512, 512] + - [33, 3936.0] + - - [512, 486, 64, 128, 512, 512, 512, 486] + - [0, 18090.0] + - - [256, 256, 36, 128, 256, 256, 256, 256] + - [0, 17100.0] + - - [256, 256, 36, 32, 256, 256, 256, 256] + - [0, 10066.0] + - - [192, 256, 64, 288, 192, 192, 192, 256] + - [0, 13450.0] + - - [256, 256, 36, 16, 256, 256, 256, 256] + - [0, 6011.0] + - - [128, 256, 36, 3200, 128, 128, 128, 256] + - [30, 18809.0] + - - [160, 320, 64, 512, 160, 160, 160, 320] + - [17, 11926.0] + - - [160, 320, 36, 512, 160, 160, 160, 320] + - [36, 11631.0] + - - [256, 512, 36, 4, 256, 256, 256, 512] + - [45, 1958.0] + - - [256, 324, 64, 1568, 256, 256, 256, 324] + - [21, 16521.0] + - - [256, 256, 36, 3200, 256, 256, 256, 256] + - [5, 19747.0] + - - [256, 256, 36, 210, 256, 256, 256, 256] + - [17, 17299.0] + - - [192, 384, 64, 576, 192, 192, 192, 384] + - [1, 14547.0] + - - [512, 512, 64, 800, 512, 512, 512, 512] + - [4, 20281.0] + - - [256, 256, 64, 1152, 256, 256, 256, 256] + - [20, 18325.0] + - - [512, 486, 64, 512, 512, 512, 512, 486] + - [1, 18891.0] + - - [256, 512, 64, 1600, 256, 256, 256, 512] + - [21, 19441.0] + - - [512, 512, 64, 9, 512, 512, 512, 512] + - [3, 4518.0] + - - [256, 512, 36, 1568, 256, 256, 256, 512] + - [5, 20048.0] + - - [128, 256, 64, 3200, 128, 128, 128, 256] + - [12, 17154.0] + - - [256, 512, 64, 4, 256, 256, 256, 512] + - [33, 2110.0] + - - [256, 256, 64, 450, 256, 256, 256, 256] + - [33, 18254.0] + - - [256, 256, 64, 72, 256, 256, 256, 256] + - [0, 16648.0] + - - [128, 256, 36, 3136, 128, 128, 128, 256] + - [38, 18780.0] + - - [160, 320, 64, 242, 160, 160, 160, 320] + - [3, 11103.0] + - - [512, 512, 36, 512, 512, 512, 512, 512] + - [4, 20331.0] + - - [512, 512, 36, 256, 512, 512, 512, 512] + - [1, 19914.0] + - - [512, 512, 36, 1024, 512, 512, 512, 512] + - [4, 20447.0] + - - [256, 256, 36, 4096, 256, 256, 256, 256] + - [38, 19838.0] + - - [256, 256, 64, 896, 256, 256, 256, 256] + - [17, 18282.0] + - - [128, 256, 64, 242, 128, 128, 128, 256] + - [47, 16030.0] + - - [192, 384, 36, 1024, 192, 192, 192, 384] + - [5, 14792.0] + - - [128, 256, 64, 100, 128, 128, 128, 256] + - [17, 14604.0] + - - [384, 256, 64, 1152, 384, 384, 384, 256] + - [21, 19600.0] + - - [192, 384, 36, 128, 192, 192, 192, 384] + - [9, 12030.0] + - - [128, 256, 64, 1568, 128, 128, 128, 256] + - [22, 17525.0] + - - [128, 256, 64, 72, 128, 128, 128, 256] + - [9, 13222.0] + - - [256, 256, 36, 12544, 256, 256, 256, 256] + - [6, 20028.0] + - - [256, 256, 36, 105, 256, 256, 256, 256] + - [9, 15639.0] + - - [128, 256, 36, 392, 128, 128, 128, 256] + - [35, 17255.0] + - - [384, 256, 36, 1024, 384, 384, 384, 256] + - [5, 19900.0] + - - [128, 256, 64, 1152, 128, 128, 128, 256] + - [38, 17456.0] + - - [256, 324, 64, 32, 256, 256, 256, 324] + - [9, 10684.0] + - - [256, 384, 36, 800, 256, 256, 256, 384] + - [38, 19768.0] + - - [512, 512, 64, 4, 512, 512, 512, 512] + - [9, 2075.0] + - - [192, 320, 36, 128, 192, 192, 192, 320] + - [9, 12245.0] + - - [192, 384, 64, 242, 192, 192, 192, 384] + - [38, 13939.0] + - - [256, 486, 64, 32, 256, 256, 256, 486] + - [33, 10283.0] + - - [512, 512, 64, 64, 512, 512, 512, 512] + - [0, 18944.0] + - - [128, 256, 36, 512, 128, 128, 128, 256] + - [5, 17743.0] + - - [512, 512, 64, 576, 512, 512, 512, 512] + - [4, 20185.0] + - - [256, 256, 64, 9, 256, 256, 256, 256] + - [3, 3651.0] + - - [128, 256, 36, 12544, 128, 128, 128, 256] + - [35, 18530.0] + - - [256, 512, 36, 3136, 256, 256, 256, 512] + - [4, 20215.0] + - - [144, 288, 36, 512, 144, 144, 144, 288] + - [3, 9263.0] + - - [384, 384, 36, 800, 384, 384, 384, 384] + - [5, 20079.0] + - - [512, 512, 64, 1600, 512, 512, 512, 512] + - [37, 20436.0] + - - [512, 512, 36, 4, 512, 512, 512, 512] + - [3, 1952.0] + - - [192, 384, 64, 450, 192, 192, 192, 384] + - [38, 14330.0] + - - [256, 256, 36, 1024, 256, 256, 256, 256] + - [38, 19508.0] + - - [256, 512, 64, 400, 256, 256, 256, 512] + - [33, 19240.0] + - - [128, 256, 36, 6272, 128, 128, 128, 256] + - [2, 18244.0] + - - [256, 256, 36, 512, 256, 256, 256, 256] + - [5, 18775.0] + - - [256, 256, 64, 112, 256, 256, 256, 256] + - [33, 17411.0] + - - [512, 512, 64, 18, 512, 512, 512, 512] + - [13, 8473.0] + - - [256, 256, 64, 18, 256, 256, 256, 256] + - [0, 6765.0] + - - [256, 256, 64, 1568, 256, 256, 256, 256] + - [3, 18409.0] + - - [384, 256, 36, 4096, 384, 384, 384, 256] + - [5, 20076.0] + - - [256, 512, 64, 800, 256, 256, 256, 512] + - [20, 19256.0] + - - [256, 384, 36, 2048, 256, 256, 256, 384] + - [47, 19890.0] + - - [384, 384, 64, 2304, 384, 384, 384, 384] + - [37, 20583.0] + - - [160, 320, 64, 128, 160, 160, 160, 320] + - [17, 10613.0] + - - [512, 512, 36, 528, 512, 512, 512, 512] + - [1, 20346.0] + - - [160, 320, 36, 128, 160, 160, 160, 320] + - [0, 10666.0] + - - [256, 512, 36, 49, 256, 256, 256, 512] + - [9, 15072.0] + - - [384, 384, 64, 450, 384, 384, 384, 384] + - [1, 20167.0] + - - [256, 256, 64, 3200, 256, 256, 256, 256] + - [3, 18547.0] + - - [512, 512, 64, 8, 512, 512, 512, 512] + - [9, 4048.0] + - - [512, 512, 64, 288, 512, 512, 512, 512] + - [1, 19850.0] + - - [384, 384, 36, 1024, 384, 384, 384, 384] + - [5, 20078.0] + - - [128, 256, 36, 16, 128, 128, 128, 256] + - [17, 3932.0] + - - [256, 256, 64, 288, 256, 256, 256, 256] + - [17, 18132.0] + - - [256, 384, 36, 1024, 256, 256, 256, 384] + - [22, 19879.0] + - - [256, 324, 36, 3200, 256, 256, 256, 324] + - [5, 16853.0] + - - [192, 384, 64, 512, 192, 192, 192, 384] + - [22, 14392.0] + - - [128, 256, 64, 1600, 128, 128, 128, 256] + - [22, 17520.0] + - - [512, 512, 36, 32, 512, 512, 512, 512] + - [9, 13187.0] + - - [512, 512, 36, 3136, 512, 512, 512, 512] + - [21, 20666.0] + - - [128, 256, 64, 6400, 128, 128, 128, 256] + - [27, 17312.0] + - - [256, 256, 36, 2048, 256, 256, 256, 256] + - [5, 19719.0] + - - [256, 256, 64, 6400, 256, 256, 256, 256] + - [36, 18573.0] + - - [256, 256, 36, 1680, 256, 256, 256, 256] + - [22, 19694.0] + - - [192, 384, 36, 2048, 192, 192, 192, 384] + - [5, 14914.0] + - - [256, 256, 64, 144, 256, 256, 256, 256] + - [33, 17466.0] + - - [384, 384, 36, 4096, 384, 384, 384, 384] + - [38, 20278.0] + - - [160, 320, 64, 1152, 160, 160, 160, 320] + - [3, 11993.0] + - - [384, 256, 36, 2048, 384, 384, 384, 256] + - [38, 19945.0] + - - [256, 512, 36, 392, 256, 256, 256, 512] + - [22, 19381.0] + - - [256, 512, 64, 50, 256, 256, 256, 512] + - [0, 16346.0] + - - [384, 384, 36, 2048, 384, 384, 384, 384] + - [5, 20191.0] + - - [256, 384, 64, 450, 256, 256, 256, 384] + - [1, 19184.0] + - - [192, 320, 64, 128, 192, 192, 192, 320] + - [17, 12866.0] + - - [128, 256, 36, 32, 128, 128, 128, 256] + - [19, 7316.0] + - - [512, 512, 64, 256, 512, 512, 512, 512] + - [1, 19945.0] + - - [256, 512, 64, 32, 256, 256, 256, 512] + - [33, 13184.0] + - - [384, 384, 64, 576, 384, 384, 384, 384] + - [4, 20206.0] + - - [512, 486, 36, 288, 512, 512, 512, 486] + - [22, 18271.0] + - - [144, 288, 64, 242, 144, 144, 144, 288] + - [13, 9132.0] + - - [384, 256, 64, 576, 384, 384, 384, 256] + - [18, 19563.0] + - - [512, 512, 36, 64, 512, 512, 512, 512] + - [33, 18269.0] + - - [448, 384, 64, 128, 448, 448, 448, 384] + - [17, 16429.0] + - - [144, 288, 64, 288, 144, 144, 144, 288] + - [13, 9320.0] + - - [512, 512, 64, 224, 512, 512, 512, 512] + - [10, 19897.0] + - - [384, 384, 64, 1152, 384, 384, 384, 384] + - [21, 20465.0] + - - [448, 384, 36, 128, 448, 448, 448, 384] + - [33, 15611.0] + - - [256, 486, 36, 128, 256, 256, 256, 486] + - [0, 16503.0] + - - [256, 256, 36, 800, 256, 256, 256, 256] + - [5, 19434.0] + - - [192, 384, 36, 800, 192, 192, 192, 384] + - [5, 14727.0] + - - [256, 256, 36, 256, 256, 256, 256, 256] + - [0, 18127.0] + - - [192, 384, 64, 1152, 192, 192, 192, 384] + - [5, 14617.0] + - - [128, 256, 64, 200, 128, 128, 128, 256] + - [33, 15780.0] + - - [512, 512, 64, 28, 512, 512, 512, 512] + - [26, 12447.0] + - - [144, 288, 64, 1152, 144, 144, 144, 288] + - [3, 9720.0] + - - [256, 256, 64, 576, 256, 256, 256, 256] + - [33, 18327.0] + - - [256, 256, 64, 2304, 256, 256, 256, 256] + - [36, 18495.0] + - - [192, 384, 36, 512, 192, 192, 192, 384] + - [5, 14350.0] + - - [256, 512, 36, 32, 256, 256, 256, 512] + - [33, 12731.0] + - - [512, 512, 64, 128, 512, 512, 512, 512] + - [1, 19469.0] + - - [512, 512, 64, 32, 512, 512, 512, 512] + - [9, 15252.0] + - - [128, 256, 36, 196, 128, 128, 128, 256] + - [0, 15435.0] + - - [196, 528, 32, 32, 196, 196, 196, 528] + - [33, 6353.0] + - - [196, 512, 32, 24, 196, 196, 196, 512] + - [0, 5070.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [42, 12402.0] + - - [1001, 1536, 1, 32, 1001, 1001, 1001, 1536] + - [19, 6853.0] + - - [196, 480, 32, 64, 196, 196, 196, 480] + - [33, 9263.0] + - - [289, 1024, 32, 384, 289, 289, 289, 1024] + - [1, 14734.0] + - - [784, 192, 32, 96, 784, 784, 784, 192] + - [9, 15033.0] + - - [50176, 256, 1, 128, 50176, 50176, 50176, 256] + - [38, 19262.0] + - - [289, 1024, 32, 256, 289, 289, 289, 1024] + - [33, 14627.0] + - - [289, 1024, 32, 192, 289, 289, 289, 1024] + - [33, 14541.0] + - - [12544, 512, 1, 256, 12544, 12544, 12544, 512] + - [38, 19527.0] + - - [1225, 1728, 1, 192, 1225, 1225, 1225, 1728] + - [33, 15802.0] + - - [196, 480, 32, 96, 196, 196, 196, 480] + - [0, 10816.0] + - - [196, 512, 32, 144, 196, 196, 196, 512] + - [0, 13041.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [17, 13935.0] + - - [5329, 576, 1, 96, 5329, 5329, 5329, 576] + - [9, 16191.0] + - - [196, 528, 32, 128, 196, 196, 196, 528] + - [9, 11914.0] + - - [5329, 448, 1, 64, 5329, 5329, 5329, 448] + - [9, 12754.0] + - - [784, 256, 32, 64, 784, 784, 784, 256] + - [9, 15224.0] + - - [784, 192, 32, 32, 784, 784, 784, 192] + - [0, 11731.0] + - - [21609, 288, 1, 32, 21609, 21609, 21609, 288] + - [33, 11315.0] + - - [784, 256, 32, 32, 784, 784, 784, 256] + - [24, 12718.0] + - - [5041, 720, 1, 192, 5041, 5041, 5041, 720] + - [0, 16343.0] + - - [196, 512, 32, 128, 196, 196, 196, 512] + - [0, 12687.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [33, 14054.0] + - - [1001, 4096, 1, 512, 1001, 1001, 1001, 4096] + - [0, 17948.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [0, 16429.0] + - - [784, 192, 32, 16, 784, 784, 784, 192] + - [17, 6644.0] + - - [3136, 1024, 1, 2048, 3136, 3136, 3136, 1024] + - [22, 18201.0] + - - [784, 256, 32, 128, 784, 784, 784, 256] + - [5, 16013.0] + - - [196, 512, 32, 32, 196, 196, 196, 512] + - [33, 6439.0] + - - [1225, 384, 32, 96, 1225, 1225, 1225, 384] + - [0, 18163.0] + - - [5041, 576, 1, 96, 5041, 5041, 5041, 576] + - [17, 15838.0] + - - [5329, 160, 32, 64, 5329, 5329, 5329, 160] + - [0, 13447.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [9, 13367.0] + - - [4096, 9216, 1, 512, 4096, 4096, 4096, 9216] + - [21, 20721.0] + - - [196, 480, 32, 192, 196, 196, 196, 480] + - [33, 12436.0] + - - [3136, 1024, 1, 512, 3136, 3136, 3136, 1024] + - [5, 17604.0] + - - [784, 192, 32, 64, 784, 784, 784, 192] + - [17, 14638.0] + - - [289, 1024, 32, 128, 289, 289, 289, 1024] + - [0, 14167.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [33, 14120.0] + - - [196, 512, 32, 112, 196, 196, 196, 512] + - [0, 12567.0] + - - [1001, 2048, 1, 32, 1001, 1001, 1001, 2048] + - [33, 7885.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [9, 14941.0] + - - [1225, 384, 32, 192, 1225, 1225, 1225, 384] + - [38, 18777.0] + - - [50176, 256, 1, 512, 50176, 50176, 50176, 256] + - [4, 20134.0] + - - [196, 512, 32, 160, 196, 196, 196, 512] + - [9, 12923.0] + - - [4096, 4096, 1, 512, 4096, 4096, 4096, 4096] + - [37, 20256.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [9, 16937.0] + - - [196, 480, 32, 16, 196, 196, 196, 480] + - [33, 3431.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [0, 15845.0] + - - [1225, 1200, 1, 64, 1225, 1225, 1225, 1200] + - [33, 10453.0] + - - [1225, 384, 32, 64, 1225, 1225, 1225, 384] + - [0, 17503.0] + - - [12544, 512, 1, 1024, 12544, 12544, 12544, 512] + - [1, 20088.0] + - - [196, 512, 32, 64, 196, 196, 196, 512] + - [33, 9843.0] + - - [196, 528, 32, 256, 196, 196, 196, 528] + - [0, 12706.0] + - - [196, 528, 32, 160, 196, 196, 196, 528] + - [0, 12260.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [0, 15465.0] + - - [1001, 2048, 1, 64, 1001, 1001, 1001, 2048] + - [33, 11993.0] + - - [289, 768, 128, 128, 289, 289, 289, 768] + - [0, 14649.0] + - - [1225, 192, 128, 64, 1225, 1225, 1225, 192] + - [40, 8201.0] + - - [1225, 288, 128, 48, 1225, 1225, 1225, 288] + - [16, 6826.0] + - - [289, 768, 128, 192, 289, 289, 289, 768] + - [17, 14734.0] + - - [289, 768, 128, 160, 289, 289, 289, 768] + - [0, 14467.0] + - - [1225, 256, 128, 48, 1225, 1225, 1225, 256] + - [0, 6901.0] + - - [1225, 192, 128, 48, 1225, 1225, 1225, 192] + - [40, 7408.0] + - - [1225, 288, 128, 64, 1225, 1225, 1225, 288] + - [8, 8433.0] + - - [1225, 256, 128, 64, 1225, 1225, 1225, 256] + - [7, 7484.0] + - - [1001, 2048, 1, 128, 1001, 1001, 1001, 2048] + - [17, 14230.0] + - - [1225, 192, 128, 32, 1225, 1225, 1225, 192] + - [45, 7979.0] + - - [1001, 1536, 1, 64, 1001, 1001, 1001, 1536] + - [26, 9498.0] + - - [1024, 4096, 1, 64, 1024, 1024, 1024, 4096] + - [9, 16151.0] + - - [1024, 4096, 1, 6336, 1024, 1024, 1024, 4096] + - [33, 18654.0] + - - [512, 33708, 1, 3780, 512, 512, 512, 33708] + - [4, 20383.0] + - - [512, 33708, 1, 3968, 512, 512, 512, 33708] + - [4, 20394.0] + - - [512, 33708, 1, 4030, 512, 512, 512, 33708] + - [37, 20379.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [17, 14086.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [17, 14886.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [17, 14378.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [17, 14508.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [17, 14541.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [9, 16841.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [0, 17091.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [9, 12811.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [9, 16679.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [17, 17497.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [9, 16949.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [9, 17801.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [33, 14548.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [0, 15595.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 13888.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [21, 20128.0] + - - [512, 33708, 1, 3796, 512, 512, 512, 33708] + - [21, 20344.0] + - - [512, 33708, 1, 3822, 512, 512, 512, 33708] + - [21, 20360.0] + - - [512, 33708, 1, 3840, 512, 512, 512, 33708] + - [37, 20351.0] + - - [512, 33708, 1, 3859, 512, 512, 512, 33708] + - [21, 20363.0] + - - [512, 33708, 1, 3870, 512, 512, 512, 33708] + - [4, 20338.0] + - - [512, 33708, 1, 3876, 512, 512, 512, 33708] + - [4, 20343.0] + - - [512, 33708, 1, 3906, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 3910, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3925, 512, 512, 512, 33708] + - [21, 20345.0] + - - [512, 33708, 1, 3942, 512, 512, 512, 33708] + - [4, 20341.0] + - - [512, 33708, 1, 3944, 512, 512, 512, 33708] + - [4, 20347.0] + - - [512, 33708, 1, 3955, 512, 512, 512, 33708] + - [4, 20344.0] + - - [512, 33708, 1, 3969, 512, 512, 512, 33708] + - [4, 20347.0] + - - [512, 33708, 1, 3976, 512, 512, 512, 33708] + - [4, 20340.0] + - - [512, 33708, 1, 3977, 512, 512, 512, 33708] + - [4, 20344.0] + - - [512, 33708, 1, 3978, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3990, 512, 512, 512, 33708] + - [4, 20346.0] + - - [512, 33708, 1, 3995, 512, 512, 512, 33708] + - [21, 20346.0] + - - [512, 33708, 1, 3996, 512, 512, 512, 33708] + - [4, 20343.0] + - - [512, 33708, 1, 3999, 512, 512, 512, 33708] + - [4, 20342.0] + - - [512, 33708, 1, 4005, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 4012, 512, 512, 512, 33708] + - [21, 20342.0] + - - [512, 33708, 1, 4020, 512, 512, 512, 33708] + - [4, 20345.0] + - - [512, 33708, 1, 4026, 512, 512, 512, 33708] + - [37, 20339.0] + - - [512, 33708, 1, 4032, 512, 512, 512, 33708] + - [4, 20351.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [17, 18739.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [33, 18832.0] + - - [1024, 30522, 1, 20, 1024, 1024, 1024, 30522] + - [3, 10151.0] + - - [1024, 30522, 1, 80, 1024, 1024, 1024, 30522] + - [9, 18774.0] + - - [1024, 30522, 1, 120, 1024, 1024, 1024, 30522] + - [9, 19163.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [17, 18606.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [33, 18608.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [33, 18624.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [17, 18613.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [33, 18638.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [37, 20839.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [21, 20841.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [37, 20847.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [0, 18599.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [17, 18596.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [36, 18591.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [3, 18606.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [20, 18605.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [36, 18600.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [4, 20775.0] + - - [7744, 7744, 1, 7744, 7744, 7744, 7744, 7744] + - [4, 20634.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 1152] + - [17, 15546.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 1536] + - [38, 19113.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 1920] + - [33, 18365.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 2304] + - [22, 19741.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 2688] + - [17, 19106.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 3072] + - [10, 20137.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 3456] + - [37, 19717.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 3840] + - [38, 20201.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 4224] + - [4, 20092.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 4608] + - [4, 20574.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 4992] + - [37, 20335.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 5376] + - [38, 20308.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 5760] + - [34, 20427.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 6144] + - [4, 20562.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 6528] + - [21, 20539.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 6912] + - [37, 20530.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 7296] + - [21, 20627.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 7680] + - [37, 20716.0] + - - [1536, 768, 1, 384, 1536, 1536, 1536, 768] + - [19, 16581.0] + - - [1920, 960, 1, 384, 1920, 1920, 1920, 960] + - [17, 16560.0] + - - [2304, 1152, 1, 384, 2304, 2304, 2304, 1152] + - [5, 17393.0] + - - [2688, 1344, 1, 384, 2688, 2688, 2688, 1344] + - [17, 18049.0] + - - [3072, 1536, 1, 384, 3072, 3072, 3072, 1536] + - [47, 19648.0] + - - [3456, 1728, 1, 384, 3456, 3456, 3456, 1728] + - [33, 18766.0] + - - [3840, 1920, 1, 384, 3840, 3840, 3840, 1920] + - [34, 19368.0] + - - [4224, 2112, 1, 384, 4224, 4224, 4224, 2112] + - [17, 19134.0] + - - [4608, 2304, 1, 384, 4608, 4608, 4608, 2304] + - [4, 20217.0] + - - [4992, 2496, 1, 384, 4992, 4992, 4992, 2496] + - [1, 19501.0] + - - [5376, 2688, 1, 384, 5376, 5376, 5376, 2688] + - [37, 20065.0] + - - [5760, 2880, 1, 384, 5760, 5760, 5760, 2880] + - [18, 19850.0] + - - [6144, 3072, 1, 384, 6144, 6144, 6144, 3072] + - [37, 20503.0] + - - [6528, 3264, 1, 384, 6528, 6528, 6528, 3264] + - [37, 20046.0] + - - [6912, 3456, 1, 384, 6912, 6912, 6912, 3456] + - [4, 20397.0] + - - [7296, 3648, 1, 384, 7296, 7296, 7296, 3648] + - [18, 20175.0] + - - [7680, 3840, 1, 384, 7680, 7680, 7680, 3840] + - [37, 20711.0] + - - [768, 1536, 1, 384, 768, 768, 768, 1536] + - [35, 17422.0] + - - [1152, 2304, 1, 384, 1152, 1152, 1152, 2304] + - [5, 17488.0] + - - [1536, 3072, 1, 384, 1536, 1536, 1536, 3072] + - [38, 19742.0] + - - [1920, 3840, 1, 384, 1920, 1920, 1920, 3840] + - [18, 19391.0] + - - [2304, 4608, 1, 384, 2304, 2304, 2304, 4608] + - [4, 20265.0] + - - [2688, 5376, 1, 384, 2688, 2688, 2688, 5376] + - [18, 20111.0] + - - [3072, 6144, 1, 384, 3072, 3072, 3072, 6144] + - [14, 20531.0] + - - [3456, 6912, 1, 384, 3456, 3456, 3456, 6912] + - [4, 20442.0] + - - [3840, 7680, 1, 384, 3840, 3840, 3840, 7680] + - [37, 20583.0] + - - [4224, 8448, 1, 384, 4224, 4224, 4224, 8448] + - [4, 20521.0] + - - [4608, 9216, 1, 384, 4608, 4608, 4608, 9216] + - [37, 20628.0] + - - [4992, 9984, 1, 384, 4992, 4992, 4992, 9984] + - [37, 20664.0] + - - [5376, 10752, 1, 384, 5376, 5376, 5376, 10752] + - [37, 20729.0] + - - [5760, 11520, 1, 384, 5760, 5760, 5760, 11520] + - [21, 20738.0] + - - [6144, 12288, 1, 384, 6144, 6144, 6144, 12288] + - [21, 20737.0] + - - [6528, 13056, 1, 384, 6528, 6528, 6528, 13056] + - [21, 20785.0] + - - [6912, 13824, 1, 384, 6912, 6912, 6912, 13824] + - [37, 20819.0] + - - [7296, 14592, 1, 384, 7296, 7296, 7296, 14592] + - [4, 20831.0] + - - [7680, 15360, 1, 384, 7680, 7680, 7680, 15360] + - [37, 20840.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [33, 18438.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [3, 19032.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [22, 18342.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [38, 19203.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [22, 19592.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [22, 20060.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [33, 18127.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [20, 19261.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [23, 18793.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [23, 19521.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [39, 19908.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [36, 18736.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [36, 19395.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [5, 18970.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [22, 19946.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [5, 20247.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [20, 19485.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [23, 19324.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [13, 18781.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [21, 19673.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [37, 20201.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [33, 18927.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [3, 19138.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [23, 19642.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [31, 19491.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [37, 19490.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [4, 20099.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [3, 15776.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [3, 16570.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [3, 18169.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [22, 16891.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [38, 17606.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [33, 17103.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [23, 17851.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [23, 19448.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [28, 18124.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [3, 18616.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [33, 18985.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [37, 20367.0] + - - [512, 3280, 1, 1600, 512, 512, 512, 3280] + - [2, 18126.0] + - - [512, 3280, 1, 200, 512, 512, 512, 3280] + - [5, 15813.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [22, 16223.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [22, 17609.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [17, 18504.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] + - [33, 18700.0] + - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] + - [0, 18517.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [17, 18593.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [37, 20562.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [33, 18648.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [37, 20628.0] + - - [9216, 128, 1, 128, 9216, 9216, 9216, 128] + - [0, 12884.0] + - - [9600, 128, 1, 128, 9600, 9600, 9600, 128] + - [0, 13240.0] + - - [9984, 128, 1, 128, 9984, 9984, 9984, 128] + - [0, 13746.0] + - - [10368, 128, 1, 128, 10368, 10368, 10368, 128] + - [0, 14203.0] + - - [10752, 128, 1, 128, 10752, 10752, 10752, 128] + - [0, 14416.0] + - - [11136, 128, 1, 128, 11136, 11136, 11136, 128] + - [17, 14980.0] + - - [11520, 128, 1, 128, 11520, 11520, 11520, 128] + - [33, 13919.0] + - - [11904, 128, 1, 128, 11904, 11904, 11904, 128] + - [33, 14362.0] + - - [12288, 128, 1, 128, 12288, 12288, 12288, 128] + - [0, 14695.0] + - - [12672, 128, 1, 128, 12672, 12672, 12672, 128] + - [33, 15067.0] + - - [13056, 128, 1, 128, 13056, 13056, 13056, 128] + - [0, 15367.0] + - - [13440, 128, 1, 128, 13440, 13440, 13440, 128] + - [5, 15888.0] + - - [13824, 128, 1, 128, 13824, 13824, 13824, 128] + - [0, 15950.0] + - - [14208, 128, 1, 128, 14208, 14208, 14208, 128] + - [17, 15057.0] + - - [14592, 128, 1, 128, 14592, 14592, 14592, 128] + - [0, 15208.0] + - - [14976, 128, 1, 128, 14976, 14976, 14976, 128] + - [33, 15589.0] + - - [15360, 128, 1, 128, 15360, 15360, 15360, 128] + - [0, 15908.0] + - - [15744, 128, 1, 128, 15744, 15744, 15744, 128] + - [0, 16326.0] + - - [16128, 128, 1, 128, 16128, 16128, 16128, 128] + - [33, 15292.0] + - - [16512, 128, 1, 128, 16512, 16512, 16512, 128] + - [15, 15638.0] + - - [16896, 128, 1, 128, 16896, 16896, 16896, 128] + - [33, 15711.0] + - - [17280, 128, 1, 128, 17280, 17280, 17280, 128] + - [38, 16252.0] + - - [17664, 128, 1, 128, 17664, 17664, 17664, 128] + - [0, 16314.0] + - - [18048, 128, 1, 128, 18048, 18048, 18048, 128] + - [5, 17033.0] + - - [18432, 128, 1, 128, 18432, 18432, 18432, 128] + - [0, 16890.0] + - - [18816, 128, 1, 128, 18816, 18816, 18816, 128] + - [0, 15842.0] + - - [19200, 128, 1, 128, 19200, 19200, 19200, 128] + - [0, 15968.0] + - - [19584, 128, 1, 128, 19584, 19584, 19584, 128] + - [17, 16337.0] + - - [19968, 128, 1, 128, 19968, 19968, 19968, 128] + - [0, 16490.0] + - - [20352, 128, 1, 128, 20352, 20352, 20352, 128] + - [33, 16926.0] + - - [20736, 128, 1, 128, 20736, 20736, 20736, 128] + - [17, 16086.0] + - - [21120, 128, 1, 128, 21120, 21120, 21120, 128] + - [38, 16400.0] + - - [21504, 128, 1, 128, 21504, 21504, 21504, 128] + - [0, 16372.0] + - - [21888, 128, 1, 128, 21888, 21888, 21888, 128] + - [5, 16820.0] + - - [22272, 128, 1, 128, 22272, 22272, 22272, 128] + - [5, 16847.0] + - - [22656, 128, 1, 128, 22656, 22656, 22656, 128] + - [5, 17329.0] + - - [23040, 128, 1, 128, 23040, 23040, 23040, 128] + - [0, 17268.0] + - - [9216, 128, 1, 256, 9216, 9216, 9216, 128] + - [3, 16377.0] + - - [9600, 128, 1, 256, 9600, 9600, 9600, 128] + - [0, 14390.0] + - - [9984, 128, 1, 256, 9984, 9984, 9984, 128] + - [33, 14911.0] + - - [10368, 128, 1, 256, 10368, 10368, 10368, 128] + - [0, 15485.0] + - - [10752, 128, 1, 256, 10752, 10752, 10752, 128] + - [0, 15885.0] + - - [11136, 128, 1, 256, 11136, 11136, 11136, 128] + - [17, 16497.0] + - - [11520, 128, 1, 256, 11520, 11520, 11520, 128] + - [15, 14992.0] + - - [11904, 128, 1, 256, 11904, 11904, 11904, 128] + - [5, 15628.0] + - - [12288, 128, 1, 256, 12288, 12288, 12288, 128] + - [22, 15790.0] + - - [12672, 128, 1, 256, 12672, 12672, 12672, 128] + - [38, 16556.0] + - - [13056, 128, 1, 256, 13056, 13056, 13056, 128] + - [30, 16660.0] + - - [13440, 128, 1, 256, 13440, 13440, 13440, 128] + - [38, 17462.0] + - - [13824, 128, 1, 256, 13824, 13824, 13824, 128] + - [5, 17558.0] + - - [14208, 128, 1, 256, 14208, 14208, 14208, 128] + - [33, 15988.0] + - - [14592, 128, 1, 256, 14592, 14592, 14592, 128] + - [0, 16231.0] + - - [14976, 128, 1, 256, 14976, 14976, 14976, 128] + - [33, 16749.0] + - - [15360, 128, 1, 256, 15360, 15360, 15360, 128] + - [0, 17050.0] + - - [15744, 128, 1, 256, 15744, 15744, 15744, 128] + - [17, 17548.0] + - - [16128, 128, 1, 256, 16128, 16128, 16128, 128] + - [22, 16251.0] + - - [16512, 128, 1, 256, 16512, 16512, 16512, 128] + - [5, 16762.0] + - - [16896, 128, 1, 256, 16896, 16896, 16896, 128] + - [15, 16880.0] + - - [17280, 128, 1, 256, 17280, 17280, 17280, 128] + - [47, 17476.0] + - - [17664, 128, 1, 256, 17664, 17664, 17664, 128] + - [15, 17519.0] + - - [18048, 128, 1, 256, 18048, 18048, 18048, 128] + - [38, 18197.0] + - - [18432, 128, 1, 256, 18432, 18432, 18432, 128] + - [22, 18225.0] + - - [18816, 128, 1, 256, 18816, 18816, 18816, 128] + - [33, 16682.0] + - - [19200, 128, 1, 256, 19200, 19200, 19200, 128] + - [33, 16958.0] + - - [19584, 128, 1, 256, 19584, 19584, 19584, 128] + - [33, 17279.0] + - - [19968, 128, 1, 256, 19968, 19968, 19968, 128] + - [33, 17532.0] + - - [20352, 128, 1, 256, 20352, 20352, 20352, 128] + - [33, 17908.0] + - - [20736, 128, 1, 256, 20736, 20736, 20736, 128] + - [5, 17038.0] + - - [21120, 128, 1, 256, 21120, 21120, 21120, 128] + - [5, 17362.0] + - - [21504, 128, 1, 256, 21504, 21504, 21504, 128] + - [38, 17537.0] + - - [21888, 128, 1, 256, 21888, 21888, 21888, 128] + - [22, 17922.0] + - - [22272, 128, 1, 256, 22272, 22272, 22272, 128] + - [5, 18047.0] + - - [22656, 128, 1, 256, 22656, 22656, 22656, 128] + - [5, 18551.0] + - - [23040, 128, 1, 256, 23040, 23040, 23040, 128] + - [15, 18559.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 8064] + - [37, 20737.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 8448] + - [21, 20706.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 8832] + - [37, 20778.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 9216] + - [21, 20759.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 9600] + - [14, 20794.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 9984] + - [21, 20802.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 10368] + - [18, 20806.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 10752] + - [14, 20830.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 11136] + - [37, 20846.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 11520] + - [21, 20809.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 11904] + - [4, 20879.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 12288] + - [21, 20846.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 12672] + - [21, 20870.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 13056] + - [4, 20856.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 13440] + - [21, 20886.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 13824] + - [37, 20884.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 14208] + - [37, 20899.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 14592] + - [37, 20882.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 14976] + - [21, 20903.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 15360] + - [21, 20889.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 15744] + - [21, 20911.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 16128] + - [4, 20900.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 16512] + - [21, 20919.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 16896] + - [37, 20908.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 17280] + - [21, 20922.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 17664] + - [37, 20914.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 18048] + - [21, 20924.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 18432] + - [21, 20901.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 18816] + - [37, 20933.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 19200] + - [37, 20925.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 19584] + - [21, 20938.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 19968] + - [37, 20925.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 20352] + - [21, 20940.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 20736] + - [4, 20929.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 21120] + - [37, 20950.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 21504] + - [21, 20952.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 21888] + - [37, 20946.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 22272] + - [4, 20937.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 22656] + - [37, 20960.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 23040] + - [4, 20935.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [33, 16439.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [38, 19755.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [17, 18693.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [38, 20224.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [37, 19394.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [4, 20659.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [37, 20030.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [22, 20473.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [37, 20426.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [21, 20906.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [37, 20632.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [37, 20562.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [4, 21022.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [37, 20865.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [4, 20830.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [21, 20912.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [4, 21050.0] + - - [8064, 4032, 1, 384, 8064, 8064, 8064, 4032] + - [4, 20238.0] + - - [8448, 4224, 1, 384, 8448, 8448, 8448, 4224] + - [37, 20519.0] + - - [8832, 4416, 1, 384, 8832, 8832, 8832, 4416] + - [4, 20127.0] + - - [9216, 4608, 1, 384, 9216, 9216, 9216, 4608] + - [4, 20607.0] + - - [9600, 4800, 1, 384, 9600, 9600, 9600, 4800] + - [4, 20249.0] + - - [9984, 4992, 1, 384, 9984, 9984, 9984, 4992] + - [37, 20636.0] + - - [10368, 5184, 1, 384, 10368, 10368, 10368, 5184] + - [37, 20352.0] + - - [10752, 5376, 1, 384, 10752, 10752, 10752, 5376] + - [4, 20717.0] + - - [11136, 5568, 1, 384, 11136, 11136, 11136, 5568] + - [21, 20429.0] + - - [11520, 5760, 1, 384, 11520, 11520, 11520, 5760] + - [37, 20714.0] + - - [11904, 5952, 1, 384, 11904, 11904, 11904, 5952] + - [4, 20491.0] + - - [12288, 6144, 1, 384, 12288, 12288, 12288, 6144] + - [21, 20745.0] + - - [12672, 6336, 1, 384, 12672, 12672, 12672, 6336] + - [4, 20545.0] + - - [13056, 6528, 1, 384, 13056, 13056, 13056, 6528] + - [4, 20770.0] + - - [13440, 6720, 1, 384, 13440, 13440, 13440, 6720] + - [21, 20599.0] + - - [13824, 6912, 1, 384, 13824, 13824, 13824, 6912] + - [37, 20809.0] + - - [14208, 7104, 1, 384, 14208, 14208, 14208, 7104] + - [4, 20627.0] + - - [14592, 7296, 1, 384, 14592, 14592, 14592, 7296] + - [37, 20808.0] + - - [14976, 7488, 1, 384, 14976, 14976, 14976, 7488] + - [4, 20659.0] + - - [15360, 7680, 1, 384, 15360, 15360, 15360, 7680] + - [21, 20816.0] + - - [15744, 7872, 1, 384, 15744, 15744, 15744, 7872] + - [4, 20673.0] + - - [16128, 8064, 1, 384, 16128, 16128, 16128, 8064] + - [4, 20837.0] + - - [16512, 8256, 1, 384, 16512, 16512, 16512, 8256] + - [37, 20722.0] + - - [16896, 8448, 1, 384, 16896, 16896, 16896, 8448] + - [37, 20860.0] + - - [17280, 8640, 1, 384, 17280, 17280, 17280, 8640] + - [4, 20677.0] + - - [17664, 8832, 1, 384, 17664, 17664, 17664, 8832] + - [4, 20859.0] + - - [18048, 9024, 1, 384, 18048, 18048, 18048, 9024] + - [4, 20698.0] + - - [18432, 9216, 1, 384, 18432, 18432, 18432, 9216] + - [21, 20851.0] + - - [18816, 9408, 1, 384, 18816, 18816, 18816, 9408] + - [37, 20711.0] + - - [19200, 9600, 1, 384, 19200, 19200, 19200, 9600] + - [37, 20886.0] + - - [19584, 9792, 1, 384, 19584, 19584, 19584, 9792] + - [21, 20733.0] + - - [19968, 9984, 1, 384, 19968, 19968, 19968, 9984] + - [21, 20874.0] + - - [20352, 10176, 1, 384, 20352, 20352, 20352, 10176] + - [4, 20749.0] + - - [20736, 10368, 1, 384, 20736, 20736, 20736, 10368] + - [37, 20903.0] + - - [21120, 10560, 1, 384, 21120, 21120, 21120, 10560] + - [21, 20764.0] + - - [21504, 10752, 1, 384, 21504, 21504, 21504, 10752] + - [21, 20883.0] + - - [21888, 10944, 1, 384, 21888, 21888, 21888, 10944] + - [37, 20773.0] + - - [22272, 11136, 1, 384, 22272, 22272, 22272, 11136] + - [37, 20911.0] + - - [22656, 11328, 1, 384, 22656, 22656, 22656, 11328] + - [21, 20796.0] + - - [23040, 11520, 1, 384, 23040, 23040, 23040, 11520] + - [37, 20909.0] + - - [8064, 16128, 1, 384, 8064, 8064, 8064, 16128] + - [37, 20857.0] + - - [8448, 16896, 1, 384, 8448, 8448, 8448, 16896] + - [37, 20857.0] + - - [8832, 17664, 1, 384, 8832, 8832, 8832, 17664] + - [21, 20876.0] + - - [9216, 18432, 1, 384, 9216, 9216, 9216, 18432] + - [21, 20847.0] + - - [9600, 19200, 1, 384, 9600, 9600, 9600, 19200] + - [21, 20893.0] + - - [9984, 19968, 1, 384, 9984, 9984, 9984, 19968] + - [37, 20882.0] + - - [10368, 20736, 1, 384, 10368, 10368, 10368, 20736] + - [37, 20910.0] + - - [10752, 21504, 1, 384, 10752, 10752, 10752, 21504] + - [37, 20892.0] + - - [11136, 22272, 1, 384, 11136, 11136, 11136, 22272] + - [37, 20918.0] + - - [11520, 23040, 1, 384, 11520, 11520, 11520, 23040] + - [37, 20905.0] + - - [11904, 23808, 1, 384, 11904, 11904, 11904, 23808] + - [21, 20931.0] + - - [12288, 24576, 1, 384, 12288, 12288, 12288, 24576] + - [21, 20908.0] + - - [12672, 25344, 1, 384, 12672, 12672, 12672, 25344] + - [21, 20933.0] + - - [13056, 26112, 1, 384, 13056, 13056, 13056, 26112] + - [21, 20914.0] + - - [13440, 26880, 1, 384, 13440, 13440, 13440, 26880] + - [21, 20938.0] + - - [13824, 27648, 1, 384, 13824, 13824, 13824, 27648] + - [37, 20928.0] + - - [14208, 28416, 1, 384, 14208, 14208, 14208, 28416] + - [4, 20976.0] + - - [14592, 29184, 1, 384, 14592, 14592, 14592, 29184] + - [37, 20930.0] + - - [14976, 29952, 1, 384, 14976, 14976, 14976, 29952] + - [21, 20948.0] + - - [15360, 30720, 1, 384, 15360, 15360, 15360, 30720] + - [21, 20927.0] + - - [15744, 31488, 1, 384, 15744, 15744, 15744, 31488] + - [21, 20950.0] + - - [16128, 32256, 1, 384, 16128, 16128, 16128, 32256] + - [37, 20940.0] + - - [16512, 33024, 1, 384, 16512, 16512, 16512, 33024] + - [21, 20957.0] + - - [16896, 33792, 1, 384, 16896, 16896, 16896, 33792] + - [37, 20944.0] + - - [17280, 34560, 1, 384, 17280, 17280, 17280, 34560] + - [37, 20960.0] + - - [17664, 35328, 1, 384, 17664, 17664, 17664, 35328] + - [37, 20966.0] + - - [18048, 36096, 1, 384, 18048, 18048, 18048, 36096] + - [37, 20957.0] + - - [18432, 36864, 1, 384, 18432, 18432, 18432, 36864] + - [21, 20935.0] + - - [18816, 37632, 1, 384, 18816, 18816, 18816, 37632] + - [37, 20975.0] + - - [19200, 38400, 1, 384, 19200, 19200, 19200, 38400] + - [37, 20950.0] + - - [19584, 39168, 1, 384, 19584, 19584, 19584, 39168] + - [21, 20973.0] + - - [19968, 39936, 1, 384, 19968, 19968, 19968, 39936] + - [4, 20963.0] + - - [20352, 40704, 1, 384, 20352, 20352, 20352, 40704] + - [37, 20978.0] + - - [20736, 41472, 1, 384, 20736, 20736, 20736, 41472] + - [37, 20976.0] + - - [21120, 42240, 1, 384, 21120, 21120, 21120, 42240] + - [37, 20980.0] + - - [21504, 43008, 1, 384, 21504, 21504, 21504, 43008] + - [4, 20970.0] + - - [21888, 43776, 1, 384, 21888, 21888, 21888, 43776] + - [37, 20973.0] + - - [22272, 44544, 1, 384, 22272, 22272, 22272, 44544] + - [21, 20973.0] + - - [22656, 45312, 1, 384, 22656, 22656, 22656, 45312] + - [37, 20990.0] + - - [23040, 46080, 1, 384, 23040, 23040, 23040, 46080] + - [21, 20979.0] + - - [1152, 1536, 1, 384, 1152, 1152, 1152, 1536] + - [38, 18062.0] + - - [1920, 1536, 1, 384, 1920, 1920, 1920, 1536] + - [5, 19319.0] + - - [2304, 1536, 1, 384, 2304, 2304, 2304, 1536] + - [38, 19469.0] + - - [2688, 1536, 1, 384, 2688, 2688, 2688, 1536] + - [5, 19622.0] + - - [3456, 1536, 1, 384, 3456, 3456, 3456, 1536] + - [5, 19825.0] + - - [3840, 1536, 1, 384, 3840, 3840, 3840, 1536] + - [22, 19826.0] + - - [4224, 1536, 1, 384, 4224, 4224, 4224, 1536] + - [38, 19915.0] + - - [4608, 1536, 1, 384, 4608, 4608, 4608, 1536] + - [4, 19970.0] + - - [4992, 1536, 1, 384, 4992, 4992, 4992, 1536] + - [5, 19976.0] + - - [5376, 1536, 1, 384, 5376, 5376, 5376, 1536] + - [4, 20061.0] + - - [5760, 1536, 1, 384, 5760, 5760, 5760, 1536] + - [5, 20039.0] + - - [6144, 1536, 1, 384, 6144, 6144, 6144, 1536] + - [1, 20135.0] + - - [6528, 1536, 1, 384, 6528, 6528, 6528, 1536] + - [38, 20098.0] + - - [6912, 1536, 1, 384, 6912, 6912, 6912, 1536] + - [4, 20239.0] + - - [7296, 1536, 1, 384, 7296, 7296, 7296, 1536] + - [38, 20132.0] + - - [7680, 1536, 1, 384, 7680, 7680, 7680, 1536] + - [4, 20299.0] + - - [8064, 1536, 1, 384, 8064, 8064, 8064, 1536] + - [38, 20171.0] + - - [8448, 1536, 1, 384, 8448, 8448, 8448, 1536] + - [4, 20340.0] + - - [8832, 1536, 1, 384, 8832, 8832, 8832, 1536] + - [38, 20179.0] + - - [9216, 1536, 1, 384, 9216, 9216, 9216, 1536] + - [4, 20380.0] + - - [9600, 1536, 1, 384, 9600, 9600, 9600, 1536] + - [38, 20207.0] + - - [9984, 1536, 1, 384, 9984, 9984, 9984, 1536] + - [4, 20403.0] + - - [10368, 1536, 1, 384, 10368, 10368, 10368, 1536] + - [5, 20221.0] + - - [10752, 1536, 1, 384, 10752, 10752, 10752, 1536] + - [4, 20471.0] + - - [11136, 1536, 1, 384, 11136, 11136, 11136, 1536] + - [38, 20240.0] + - - [11520, 1536, 1, 384, 11520, 11520, 11520, 1536] + - [37, 20486.0] + - - [11904, 1536, 1, 384, 11904, 11904, 11904, 1536] + - [38, 20256.0] + - - [12288, 1536, 1, 384, 12288, 12288, 12288, 1536] + - [4, 20562.0] + - - [12672, 1536, 1, 384, 12672, 12672, 12672, 1536] + - [22, 20266.0] + - - [13056, 1536, 1, 384, 13056, 13056, 13056, 1536] + - [4, 20536.0] + - - [13440, 1536, 1, 384, 13440, 13440, 13440, 1536] + - [22, 20279.0] + - - [13824, 1536, 1, 384, 13824, 13824, 13824, 1536] + - [46, 20569.0] + - - [14208, 1536, 1, 384, 14208, 14208, 14208, 1536] + - [38, 20287.0] + - - [14592, 1536, 1, 384, 14592, 14592, 14592, 1536] + - [4, 20569.0] + - - [14976, 1536, 1, 384, 14976, 14976, 14976, 1536] + - [22, 20296.0] + - - [15360, 1536, 1, 384, 15360, 15360, 15360, 1536] + - [4, 20572.0] + - - [15744, 1536, 1, 384, 15744, 15744, 15744, 1536] + - [38, 20299.0] + - - [16128, 1536, 1, 384, 16128, 16128, 16128, 1536] + - [4, 20597.0] + - - [16512, 1536, 1, 384, 16512, 16512, 16512, 1536] + - [38, 20321.0] + - - [16896, 1536, 1, 384, 16896, 16896, 16896, 1536] + - [37, 20618.0] + - - [17280, 1536, 1, 384, 17280, 17280, 17280, 1536] + - [38, 20293.0] + - - [17664, 1536, 1, 384, 17664, 17664, 17664, 1536] + - [46, 20598.0] + - - [18048, 1536, 1, 384, 18048, 18048, 18048, 1536] + - [22, 20280.0] + - - [18432, 1536, 1, 384, 18432, 18432, 18432, 1536] + - [4, 20551.0] + - - [18816, 1536, 1, 384, 18816, 18816, 18816, 1536] + - [38, 20288.0] + - - [19200, 1536, 1, 384, 19200, 19200, 19200, 1536] + - [4, 20564.0] + - - [19584, 1536, 1, 384, 19584, 19584, 19584, 1536] + - [22, 20292.0] + - - [19968, 1536, 1, 384, 19968, 19968, 19968, 1536] + - [4, 20597.0] + - - [20352, 1536, 1, 384, 20352, 20352, 20352, 1536] + - [34, 20325.0] + - - [20736, 1536, 1, 384, 20736, 20736, 20736, 1536] + - [37, 20585.0] + - - [21120, 1536, 1, 384, 21120, 21120, 21120, 1536] + - [37, 20327.0] + - - [21504, 1536, 1, 384, 21504, 21504, 21504, 1536] + - [4, 20556.0] + - - [21888, 1536, 1, 384, 21888, 21888, 21888, 1536] + - [37, 20357.0] + - - [22272, 1536, 1, 384, 22272, 22272, 22272, 1536] + - [37, 20610.0] + - - [22656, 1536, 1, 384, 22656, 22656, 22656, 1536] + - [34, 20364.0] + - - [23040, 1536, 1, 384, 23040, 23040, 23040, 1536] + - [37, 20598.0] + - - [768, 1920, 1, 384, 768, 768, 768, 1920] + - [38, 15156.0] + - - [1152, 1920, 1, 384, 1152, 1152, 1152, 1920] + - [38, 17979.0] + - - [1536, 1920, 1, 384, 1536, 1536, 1536, 1920] + - [22, 19168.0] + - - [2304, 1920, 1, 384, 2304, 2304, 2304, 1920] + - [38, 18549.0] + - - [2688, 1920, 1, 384, 2688, 2688, 2688, 1920] + - [5, 19297.0] + - - [3072, 1920, 1, 384, 3072, 3072, 3072, 1920] + - [1, 19816.0] + - - [3456, 1920, 1, 384, 3456, 3456, 3456, 1920] + - [17, 19058.0] + - - [4224, 1920, 1, 384, 4224, 4224, 4224, 1920] + - [34, 19831.0] + - - [4608, 1920, 1, 384, 4608, 4608, 4608, 1920] + - [38, 20077.0] + - - [4992, 1920, 1, 384, 4992, 4992, 4992, 1920] + - [37, 19471.0] + - - [5376, 1920, 1, 384, 5376, 5376, 5376, 1920] + - [37, 19761.0] + - - [5760, 1920, 1, 384, 5760, 5760, 5760, 1920] + - [37, 20073.0] + - - [6144, 1920, 1, 384, 6144, 6144, 6144, 1920] + - [4, 20313.0] + - - [6528, 1920, 1, 384, 6528, 6528, 6528, 1920] + - [21, 19763.0] + - - [6912, 1920, 1, 384, 6912, 6912, 6912, 1920] + - [4, 19986.0] + - - [7296, 1920, 1, 384, 7296, 7296, 7296, 1920] + - [34, 20241.0] + - - [7680, 1920, 1, 384, 7680, 7680, 7680, 1920] + - [30, 20233.0] + - - [8064, 1920, 1, 384, 8064, 8064, 8064, 1920] + - [18, 19966.0] + - - [8448, 1920, 1, 384, 8448, 8448, 8448, 1920] + - [4, 20144.0] + - - [8832, 1920, 1, 384, 8832, 8832, 8832, 1920] + - [4, 20376.0] + - - [9216, 1920, 1, 384, 9216, 9216, 9216, 1920] + - [4, 20501.0] + - - [9600, 1920, 1, 384, 9600, 9600, 9600, 1920] + - [37, 20136.0] + - - [9984, 1920, 1, 384, 9984, 9984, 9984, 1920] + - [37, 20255.0] + - - [10368, 1920, 1, 384, 10368, 10368, 10368, 1920] + - [21, 20438.0] + - - [10752, 1920, 1, 384, 10752, 10752, 10752, 1920] + - [22, 20273.0] + - - [11136, 1920, 1, 384, 11136, 11136, 11136, 1920] + - [37, 20227.0] + - - [11520, 1920, 1, 384, 11520, 11520, 11520, 1920] + - [37, 20351.0] + - - [11904, 1920, 1, 384, 11904, 11904, 11904, 1920] + - [21, 20507.0] + - - [12288, 1920, 1, 384, 12288, 12288, 12288, 1920] + - [4, 20590.0] + - - [12672, 1920, 1, 384, 12672, 12672, 12672, 1920] + - [37, 20319.0] + - - [13056, 1920, 1, 384, 13056, 13056, 13056, 1920] + - [37, 20417.0] + - - [13440, 1920, 1, 384, 13440, 13440, 13440, 1920] + - [37, 20560.0] + - - [13824, 1920, 1, 384, 13824, 13824, 13824, 1920] + - [47, 20318.0] + - - [14208, 1920, 1, 384, 14208, 14208, 14208, 1920] + - [46, 20345.0] + - - [14592, 1920, 1, 384, 14592, 14592, 14592, 1920] + - [34, 20386.0] + - - [14976, 1920, 1, 384, 14976, 14976, 14976, 1920] + - [21, 20518.0] + - - [15360, 1920, 1, 384, 15360, 15360, 15360, 1920] + - [4, 20565.0] + - - [15744, 1920, 1, 384, 15744, 15744, 15744, 1920] + - [34, 20374.0] + - - [16128, 1920, 1, 384, 16128, 16128, 16128, 1920] + - [37, 20444.0] + - - [16512, 1920, 1, 384, 16512, 16512, 16512, 1920] + - [21, 20551.0] + - - [16896, 1920, 1, 384, 16896, 16896, 16896, 1920] + - [22, 20331.0] + - - [17280, 1920, 1, 384, 17280, 17280, 17280, 1920] + - [21, 20425.0] + - - [17664, 1920, 1, 384, 17664, 17664, 17664, 1920] + - [37, 20487.0] + - - [18048, 1920, 1, 384, 18048, 18048, 18048, 1920] + - [37, 20601.0] + - - [18432, 1920, 1, 384, 18432, 18432, 18432, 1920] + - [4, 20605.0] + - - [18816, 1920, 1, 384, 18816, 18816, 18816, 1920] + - [21, 20462.0] + - - [19200, 1920, 1, 384, 19200, 19200, 19200, 1920] + - [37, 20515.0] + - - [19584, 1920, 1, 384, 19584, 19584, 19584, 1920] + - [21, 20609.0] + - - [19968, 1920, 1, 384, 19968, 19968, 19968, 1920] + - [21, 20417.0] + - - [20352, 1920, 1, 384, 20352, 20352, 20352, 1920] + - [37, 20498.0] + - - [20736, 1920, 1, 384, 20736, 20736, 20736, 1920] + - [37, 20564.0] + - - [21120, 1920, 1, 384, 21120, 21120, 21120, 1920] + - [21, 20650.0] + - - [21504, 1920, 1, 384, 21504, 21504, 21504, 1920] + - [4, 20611.0] + - - [21888, 1920, 1, 384, 21888, 21888, 21888, 1920] + - [21, 20538.0] + - - [22272, 1920, 1, 384, 22272, 22272, 22272, 1920] + - [46, 20553.0] + - - [22656, 1920, 1, 384, 22656, 22656, 22656, 1920] + - [37, 20665.0] + - - [23040, 1920, 1, 384, 23040, 23040, 23040, 1920] + - [37, 20483.0] + - - [768, 2304, 1, 384, 768, 768, 768, 2304] + - [35, 17881.0] + - - [1536, 2304, 1, 384, 1536, 1536, 1536, 2304] + - [22, 19391.0] + - - [1920, 2304, 1, 384, 1920, 1920, 1920, 2304] + - [38, 18593.0] + - - [2688, 2304, 1, 384, 2688, 2688, 2688, 2304] + - [34, 19136.0] + - - [3072, 2304, 1, 384, 3072, 3072, 3072, 2304] + - [1, 19985.0] + - - [3456, 2304, 1, 384, 3456, 3456, 3456, 2304] + - [34, 19498.0] + - - [3840, 2304, 1, 384, 3840, 3840, 3840, 2304] + - [5, 19987.0] + - - [4224, 2304, 1, 384, 4224, 4224, 4224, 2304] + - [21, 19738.0] + - - [4992, 2304, 1, 384, 4992, 4992, 4992, 2304] + - [4, 19884.0] + - - [5376, 2304, 1, 384, 5376, 5376, 5376, 2304] + - [22, 20108.0] + - - [5760, 2304, 1, 384, 5760, 5760, 5760, 2304] + - [37, 20016.0] + - - [6144, 2304, 1, 384, 6144, 6144, 6144, 2304] + - [4, 20391.0] + - - [6528, 2304, 1, 384, 6528, 6528, 6528, 2304] + - [37, 20158.0] + - - [6912, 2304, 1, 384, 6912, 6912, 6912, 2304] + - [47, 20198.0] + - - [7296, 2304, 1, 384, 7296, 7296, 7296, 2304] + - [21, 20225.0] + - - [7680, 2304, 1, 384, 7680, 7680, 7680, 2304] + - [37, 20491.0] + - - [8064, 2304, 1, 384, 8064, 8064, 8064, 2304] + - [21, 20266.0] + - - [8448, 2304, 1, 384, 8448, 8448, 8448, 2304] + - [22, 20270.0] + - - [8832, 2304, 1, 384, 8832, 8832, 8832, 2304] + - [37, 20329.0] + - - [9216, 2304, 1, 384, 9216, 9216, 9216, 2304] + - [4, 20546.0] + - - [9600, 2304, 1, 384, 9600, 9600, 9600, 2304] + - [37, 20374.0] + - - [9984, 2304, 1, 384, 9984, 9984, 9984, 2304] + - [38, 20281.0] + - - [10368, 2304, 1, 384, 10368, 10368, 10368, 2304] + - [37, 20430.0] + - - [10752, 2304, 1, 384, 10752, 10752, 10752, 2304] + - [37, 20614.0] + - - [11136, 2304, 1, 384, 11136, 11136, 11136, 2304] + - [34, 20439.0] + - - [11520, 2304, 1, 384, 11520, 11520, 11520, 2304] + - [47, 20280.0] + - - [11904, 2304, 1, 384, 11904, 11904, 11904, 2304] + - [46, 20423.0] + - - [12288, 2304, 1, 384, 12288, 12288, 12288, 2304] + - [4, 20531.0] + - - [12672, 2304, 1, 384, 12672, 12672, 12672, 2304] + - [37, 20435.0] + - - [13056, 2304, 1, 384, 13056, 13056, 13056, 2304] + - [22, 20298.0] + - - [13440, 2304, 1, 384, 13440, 13440, 13440, 2304] + - [37, 20470.0] + - - [13824, 2304, 1, 384, 13824, 13824, 13824, 2304] + - [37, 20589.0] + - - [14208, 2304, 1, 384, 14208, 14208, 14208, 2304] + - [18, 20508.0] + - - [14592, 2304, 1, 384, 14592, 14592, 14592, 2304] + - [37, 20340.0] + - - [14976, 2304, 1, 384, 14976, 14976, 14976, 2304] + - [37, 20514.0] + - - [15360, 2304, 1, 384, 15360, 15360, 15360, 2304] + - [4, 20581.0] + - - [15744, 2304, 1, 384, 15744, 15744, 15744, 2304] + - [21, 20527.0] + - - [16128, 2304, 1, 384, 16128, 16128, 16128, 2304] + - [37, 20401.0] + - - [16512, 2304, 1, 384, 16512, 16512, 16512, 2304] + - [21, 20562.0] + - - [16896, 2304, 1, 384, 16896, 16896, 16896, 2304] + - [37, 20651.0] + - - [17280, 2304, 1, 384, 17280, 17280, 17280, 2304] + - [37, 20574.0] + - - [17664, 2304, 1, 384, 17664, 17664, 17664, 2304] + - [37, 20440.0] + - - [18048, 2304, 1, 384, 18048, 18048, 18048, 2304] + - [37, 20588.0] + - - [18432, 2304, 1, 384, 18432, 18432, 18432, 2304] + - [4, 20594.0] + - - [18816, 2304, 1, 384, 18816, 18816, 18816, 2304] + - [21, 20608.0] + - - [19200, 2304, 1, 384, 19200, 19200, 19200, 2304] + - [37, 20487.0] + - - [19584, 2304, 1, 384, 19584, 19584, 19584, 2304] + - [37, 20626.0] + - - [19968, 2304, 1, 384, 19968, 19968, 19968, 2304] + - [37, 20664.0] + - - [20352, 2304, 1, 384, 20352, 20352, 20352, 2304] + - [21, 20633.0] + - - [20736, 2304, 1, 384, 20736, 20736, 20736, 2304] + - [37, 20526.0] + - - [21120, 2304, 1, 384, 21120, 21120, 21120, 2304] + - [21, 20640.0] + - - [21504, 2304, 1, 384, 21504, 21504, 21504, 2304] + - [4, 20666.0] + - - [21888, 2304, 1, 384, 21888, 21888, 21888, 2304] + - [37, 20658.0] + - - [22272, 2304, 1, 384, 22272, 22272, 22272, 2304] + - [4, 20541.0] + - - [22656, 2304, 1, 384, 22656, 22656, 22656, 2304] + - [37, 20659.0] + - - [23040, 2304, 1, 384, 23040, 23040, 23040, 2304] + - [37, 20707.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [9, 543.0] + - - [289, 128, 64, 768, 289, 289, 289, 128] + - [17, 13878.0] + - - [289, 160, 64, 768, 289, 289, 289, 160] + - [17, 12163.0] + - - [289, 192, 64, 768, 289, 289, 289, 192] + - [17, 14561.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [33, 18486.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [9, 16695.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [17, 16034.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [33, 14504.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [0, 17640.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [33, 17318.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [17, 17144.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [0, 18635.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [5, 18336.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [5, 18085.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [0, 18341.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [19, 16933.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [33, 16924.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [21, 21041.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [37, 20592.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [34, 19118.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [4, 21037.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [37, 20588.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [22, 19147.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [21, 21062.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [37, 20468.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [17, 18540.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [33, 18336.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [0, 18604.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [33, 18227.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [33, 18509.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [1, 19524.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [17, 18485.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [1, 20382.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [33, 18613.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [1, 19981.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [33, 18616.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [21, 20431.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [17, 18612.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [17, 18642.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [21, 20401.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [33, 18615.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [33, 18564.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [1, 20144.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [17, 18556.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [4, 20196.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [33, 18449.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [38, 19348.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [33, 18438.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [42, 18501.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [42, 19418.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [33, 18477.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [33, 18617.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [21, 20887.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [4, 20873.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [3, 18651.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [36, 18603.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [36, 18611.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [33, 18620.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [33, 18634.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [33, 18618.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [33, 18626.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [37, 20848.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [4, 20838.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [37, 20843.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [36, 18623.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [36, 18604.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [20, 18584.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [36, 18597.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [36, 18597.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [20, 18602.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [9, 355.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [9, 413.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [9, 949.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [4, 20850.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [18, 19984.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [4, 20360.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [21, 20335.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [21, 20670.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [17, 18837.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [4, 20598.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [37, 20632.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [33, 18843.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [36, 19201.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [0, 19062.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [3, 1925.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [37, 20447.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [38, 18028.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [38, 19919.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [37, 20523.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [21, 20950.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [37, 20661.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [21, 20666.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [23, 19970.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [23, 20574.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [4, 20949.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [21, 20704.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [21, 20699.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [36, 18869.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [13, 18627.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [21, 20929.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [28, 18617.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [4, 20883.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [4, 20905.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [37, 20937.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [37, 20626.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [9, 336.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [21, 20396.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [37, 20519.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [20, 18610.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [33, 18625.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [37, 20541.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [33, 18619.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [18, 20532.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [45, 18607.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [18, 19673.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [33, 18620.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [25, 19976.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [42, 18594.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [0, 17848.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [18, 17626.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [0, 17521.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [21, 19850.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [3, 14717.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [1, 17574.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [18, 17943.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [34, 15167.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [37, 15466.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [33, 18591.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [33, 16957.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [0, 19012.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [37, 19399.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [33, 14100.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [5, 17380.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [10, 17750.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [33, 14654.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [4, 14736.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [33, 18642.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [28, 18641.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [3, 18850.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [3, 18853.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [20, 18645.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [3, 18861.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [20, 18644.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [3, 18842.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [45, 18638.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [28, 18861.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [33, 18640.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [23, 17874.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [23, 17867.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [23, 17862.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [36, 18885.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [21, 20381.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [21, 20525.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [17, 9022.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [21, 20625.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [21, 20382.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [9, 10510.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [37, 20614.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [37, 20186.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [1, 15626.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [1, 15714.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [0, 16319.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [36, 15447.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [4, 17797.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [1, 17934.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [21, 20574.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [20, 18646.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [37, 20583.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [45, 18629.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [36, 12300.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [36, 12353.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [45, 12443.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [36, 12452.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [21, 20267.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [21, 20000.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [37, 19867.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [37, 20018.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [4, 20700.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [4, 20502.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [4, 20427.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [37, 20629.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [38, 19781.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [38, 19869.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [22, 19960.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [5, 19956.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [33, 18650.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [17, 18644.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [1, 19697.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [21, 19591.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [21, 19764.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [37, 19722.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [4, 19619.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [37, 19814.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [18, 19420.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [21, 19478.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [21, 19892.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [37, 19755.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [17, 19354.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [37, 19402.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [1, 17716.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [21, 19708.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [37, 20281.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [33, 16907.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [37, 20142.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [11, 18875.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [33, 19040.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [38, 18268.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [30, 18907.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [37, 20355.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [33, 18315.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [30, 17914.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [1, 18326.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [38, 18742.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [0, 15953.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [38, 19618.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [22, 19521.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [38, 17402.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [38, 18189.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [5, 19784.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [15, 19617.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [33, 17357.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [33, 18504.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [0, 19123.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [17, 17219.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [22, 19472.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [5, 19264.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [17, 16483.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [37, 20427.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [33, 16930.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [14, 20108.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [5, 18783.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [42, 19169.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [33, 18240.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [17, 17621.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [33, 17629.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [33, 18589.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [33, 16080.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [17, 15761.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [9, 16718.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [22, 19200.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [17, 17298.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [10, 19005.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [1, 18976.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 256] + - [38, 16895.0] + - - [888, 2048, 2, 512, 888, 888, 888, 2048] + - [17, 18158.0] + - - [12880, 512, 2, 256, 12880, 12880, 12880, 512] + - [33, 19374.0] + - - [12288, 512, 2, 256, 12288, 12288, 12288, 512] + - [1, 19603.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 128] + - [38, 19102.0] + - - [864, 2048, 2, 256, 864, 864, 864, 2048] + - [17, 17276.0] + - - [12672, 128, 2, 512, 12672, 12672, 12672, 128] + - [38, 18069.0] + - - [11264, 128, 2, 512, 11264, 11264, 11264, 128] + - [38, 18933.0] + - - [11776, 128, 2, 512, 11776, 11776, 11776, 128] + - [33, 17786.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 128] + - [5, 19668.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 1024] + - [0, 18707.0] + - - [14000, 128, 2, 512, 14000, 14000, 14000, 128] + - [33, 17678.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 512] + - [33, 19286.0] + - - [805, 2048, 2, 256, 805, 805, 805, 2048] + - [0, 16048.0] + - - [768, 2048, 2, 256, 768, 768, 768, 2048] + - [0, 18162.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 1024] + - [1, 18369.0] + - - [1251, 256, 49, 256, 1251, 1251, 1251, 256] + - [0, 19021.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 256] + - [38, 17593.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 1024] + - [0, 18263.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 1024] + - [0, 18575.0] + - - [15200, 256, 2, 12, 15200, 15200, 15200, 256] + - [22, 5392.0] + - - [12880, 256, 2, 12, 12880, 12880, 12880, 256] + - [5, 5276.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 1024] + - [5, 18826.0] + - - [13600, 256, 2, 12, 13600, 13600, 13600, 256] + - [47, 5343.0] + - - [15200, 256, 2, 3, 15200, 15200, 15200, 256] + - [35, 1497.0] + - - [12880, 256, 2, 3, 12880, 12880, 12880, 256] + - [19, 1476.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 1024] + - [15, 19175.0] + - - [12288, 256, 2, 12, 12288, 12288, 12288, 256] + - [13, 5347.0] + - - [13824, 256, 2, 12, 13824, 13824, 13824, 256] + - [3, 5438.0] + - - [13600, 256, 2, 3, 13600, 13600, 13600, 256] + - [5, 1505.0] + - - [1900, 1024, 1, 2048, 1900, 1900, 1900, 1024] + - [33, 17939.0] + - - [7600, 512, 1, 256, 7600, 7600, 7600, 512] + - [2, 17968.0] + - - [1610, 1024, 1, 2048, 1610, 1610, 1610, 1024] + - [15, 18056.0] + - - [6144, 512, 1, 256, 6144, 6144, 6144, 512] + - [0, 18089.0] + - - [1900, 1024, 1, 512, 1900, 1900, 1900, 1024] + - [0, 17336.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [37, 20517.0] + - - [3220, 256, 2, 12, 3220, 3220, 3220, 256] + - [5, 3691.0] + - - [3220, 256, 2, 3, 3220, 3220, 3220, 256] + - [19, 1066.0] + - - [3800, 256, 2, 3, 3800, 3800, 3800, 256] + - [33, 1089.0] + - - [13824, 256, 2, 3, 13824, 13824, 13824, 256] + - [3, 1506.0] + - - [12288, 256, 2, 3, 12288, 12288, 12288, 256] + - [0, 1547.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 256] + - [17, 17174.0] + - - [3072, 256, 2, 12, 3072, 3072, 3072, 256] + - [0, 3630.0] + - - [3800, 256, 2, 12, 3800, 3800, 3800, 256] + - [38, 3865.0] + - - [3072, 256, 2, 3, 3072, 3072, 3072, 256] + - [0, 1102.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 256] + - [33, 15933.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 512] + - [47, 19731.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 256] + - [17, 15234.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 256] + - [17, 14929.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 1024] + - [5, 18731.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 256] + - [38, 16579.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 1024] + - [5, 18648.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 256] + - [0, 17961.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 256] + - [5, 16980.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 1024] + - [17, 18450.0] + - - [3456, 256, 2, 3, 3456, 3456, 3456, 256] + - [35, 1129.0] + - - [3400, 256, 2, 3, 3400, 3400, 3400, 256] + - [19, 1079.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 1024] + - [38, 19249.0] + - - [3456, 256, 2, 12, 3456, 3456, 3456, 256] + - [5, 4006.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 1024] + - [33, 18796.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 256] + - [19, 17418.0] + - - [850, 2048, 2, 256, 850, 850, 850, 2048] + - [0, 16983.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 256] + - [5, 16763.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 1024] + - [0, 18142.0] + - - [51520, 256, 2, 12, 51520, 51520, 51520, 256] + - [10, 7483.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 256] + - [38, 16993.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 1024] + - [9, 18338.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 1024] + - [17, 18881.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 256] + - [5, 16054.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 256] + - [5, 17882.0] + - - [54400, 256, 2, 12, 54400, 54400, 54400, 256] + - [19, 7712.0] + - - [950, 2048, 2, 256, 950, 950, 950, 2048] + - [33, 16602.0] + - - [55296, 256, 2, 3, 55296, 55296, 55296, 256] + - [14, 1972.0] + - - [60800, 256, 2, 12, 60800, 60800, 60800, 256] + - [43, 8391.0] + - - [51520, 256, 2, 3, 51520, 51520, 51520, 256] + - [15, 1755.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 1024] + - [5, 19048.0] + - - [55296, 256, 2, 12, 55296, 55296, 55296, 256] + - [13, 6814.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 256] + - [22, 15680.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 1024] + - [5, 18837.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 256] + - [33, 17161.0] + - - [60800, 256, 2, 3, 60800, 60800, 60800, 256] + - [35, 2308.0] + - - [1269, 256, 49, 256, 1269, 1269, 1269, 256] + - [33, 19226.0] + - - [1467, 256, 49, 256, 1467, 1467, 1467, 256] + - [1, 18736.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 256] + - [0, 16185.0] + - - [952, 256, 64, 256, 952, 952, 952, 256] + - [38, 18231.0] + - - [49152, 256, 2, 12, 49152, 49152, 49152, 256] + - [4, 5952.0] + - - [1449, 256, 49, 256, 1449, 1449, 1449, 256] + - [38, 18525.0] + - - [1278, 256, 49, 256, 1278, 1278, 1278, 256] + - [17, 19331.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 256] + - [19, 18285.0] + - - [736, 256, 64, 256, 736, 736, 736, 256] + - [1, 18626.0] + - - [1413, 256, 49, 256, 1413, 1413, 1413, 256] + - [38, 18166.0] + - - [600, 256, 64, 256, 600, 600, 600, 256] + - [5, 18255.0] + - - [1341, 256, 49, 256, 1341, 1341, 1341, 256] + - [5, 18884.0] + - - [1287, 256, 49, 256, 1287, 1287, 1287, 256] + - [38, 18207.0] + - - [1332, 256, 49, 256, 1332, 1332, 1332, 256] + - [38, 18888.0] + - - [1359, 256, 49, 256, 1359, 1359, 1359, 256] + - [5, 19144.0] + - - [1440, 256, 49, 256, 1440, 1440, 1440, 256] + - [22, 18603.0] + - - [1395, 256, 49, 256, 1395, 1395, 1395, 256] + - [47, 19337.0] + - - [1323, 256, 49, 256, 1323, 1323, 1323, 256] + - [22, 18765.0] + - - [1404, 256, 49, 256, 1404, 1404, 1404, 256] + - [38, 19759.0] + - - [1386, 256, 49, 256, 1386, 1386, 1386, 256] + - [22, 19475.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 1024] + - [17, 18353.0] + - - [1350, 256, 49, 256, 1350, 1350, 1350, 256] + - [47, 18806.0] + - - [1368, 256, 49, 256, 1368, 1368, 1368, 256] + - [38, 19271.0] + - - [49152, 256, 2, 3, 49152, 49152, 49152, 256] + - [9, 1497.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 256] + - [33, 16685.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 1024] + - [0, 18748.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 1024] + - [5, 18946.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 256] + - [5, 17670.0] + - - [690, 256, 64, 256, 690, 690, 690, 256] + - [38, 17396.0] + - - [54400, 256, 2, 3, 54400, 54400, 54400, 256] + - [11, 1960.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 1024] + - [22, 19015.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 256] + - [0, 16470.0] + - - [616, 256, 64, 256, 616, 616, 616, 256] + - [38, 18823.0] + - - [3008, 256, 64, 256, 3008, 3008, 3008, 256] + - [1, 19714.0] + - - [896, 256, 64, 256, 896, 896, 896, 256] + - [18, 19935.0] + - - [768, 256, 64, 256, 768, 768, 768, 256] + - [1, 19454.0] + - - [660, 256, 64, 256, 660, 660, 660, 256] + - [0, 16658.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 256] + - [3, 16542.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 1024] + - [9, 19007.0] + - - [800, 256, 64, 256, 800, 800, 800, 256] + - [34, 17884.0] + - - [1120, 256, 49, 256, 1120, 1120, 1120, 256] + - [18, 19219.0] + - - [2408, 256, 64, 256, 2408, 2408, 2408, 256] + - [21, 19559.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 256] + - [5, 19069.0] + - - [672, 256, 64, 256, 672, 672, 672, 256] + - [17, 17008.0] + - - [782, 256, 64, 256, 782, 782, 782, 256] + - [30, 17267.0] + - - [884, 256, 64, 256, 884, 884, 884, 256] + - [22, 19398.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 1024] + - [43, 19580.0] + - - [1064, 256, 49, 256, 1064, 1064, 1064, 256] + - [1, 18138.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 256] + - [19, 18569.0] + - - [704, 256, 64, 256, 704, 704, 704, 256] + - [0, 17805.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 1024] + - [5, 19250.0] + - - [3264, 256, 64, 256, 3264, 3264, 3264, 256] + - [37, 19926.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 1024] + - [17, 18665.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 256] + - [17, 17628.0] + - - [6440, 512, 1, 256, 6440, 6440, 6440, 512] + - [5, 17542.0] + - - [6912, 512, 1, 256, 6912, 6912, 6912, 512] + - [38, 19097.0] + - - [6800, 512, 1, 256, 6800, 6800, 6800, 512] + - [5, 18257.0] + - - [6800, 512, 1, 1024, 6800, 6800, 6800, 512] + - [22, 19391.0] + - - [6440, 512, 1, 1024, 6440, 6440, 6440, 512] + - [22, 18398.0] + - - [6912, 512, 1, 1024, 6912, 6912, 6912, 512] + - [38, 19898.0] + - - [1728, 1024, 1, 512, 1728, 1728, 1728, 1024] + - [33, 15811.0] + - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] + - [38, 16918.0] + - - [1610, 1024, 1, 512, 1610, 1610, 1610, 1024] + - [5, 17157.0] + - - [7600, 512, 1, 1024, 7600, 7600, 7600, 512] + - [5, 18904.0] + - - [6144, 512, 1, 1024, 6144, 6144, 6144, 512] + - [33, 18704.0] + - - [1700, 1024, 1, 512, 1700, 1700, 1700, 1024] + - [33, 15539.0] + - - [1728, 1024, 1, 2048, 1728, 1728, 1728, 1024] + - [33, 16347.0] + - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] + - [38, 17482.0] + - - [1700, 1024, 1, 2048, 1700, 1700, 1700, 1024] + - [33, 16077.0] + - - [1920, 25216, 1, 16384, 1920, 1920, 1920, 25216] + - [21, 20862.0] + - - [3840, 1920, 1, 16384, 3840, 3840, 3840, 1920] + - [4, 19894.0] + - - [1920, 3840, 1, 16384, 1920, 1920, 1920, 3840] + - [29, 19868.0] + - - [960, 1920, 1, 16384, 960, 960, 960, 1920] + - [33, 17113.0] + - - [1920, 2880, 1, 16384, 1920, 1920, 1920, 2880] + - [20, 19500.0] + - - [1920, 25216, 1, 4096, 1920, 1920, 1920, 25216] + - [37, 20856.0] + - - [3840, 1920, 1, 4096, 3840, 3840, 3840, 1920] + - [4, 19810.0] + - - [1920, 3840, 1, 4096, 1920, 1920, 1920, 3840] + - [37, 19798.0] + - - [960, 1920, 1, 4096, 960, 960, 960, 1920] + - [33, 17109.0] + - - [1920, 2880, 1, 4096, 1920, 1920, 1920, 2880] + - [0, 19463.0] + - - [1920, 25216, 1, 8192, 1920, 1920, 1920, 25216] + - [37, 20858.0] + - - [3840, 1920, 1, 8192, 3840, 3840, 3840, 1920] + - [4, 19857.0] + - - [1920, 3840, 1, 8192, 1920, 1920, 1920, 3840] + - [37, 19878.0] + - - [960, 1920, 1, 8192, 960, 960, 960, 1920] + - [42, 17159.0] + - - [1920, 2880, 1, 8192, 1920, 1920, 1920, 2880] + - [36, 19475.0] + - - [2304, 12672, 1, 16384, 2304, 2304, 2304, 12672] + - [37, 20789.0] + - - [2304, 2304, 1, 16384, 2304, 2304, 2304, 2304] + - [5, 20356.0] + - - [576, 2304, 1, 16384, 576, 576, 576, 2304] + - [5, 14873.0] + - - [2304, 1728, 1, 16384, 2304, 2304, 2304, 1728] + - [5, 19535.0] + - - [2304, 12672, 1, 4096, 2304, 2304, 2304, 12672] + - [21, 20758.0] + - - [2304, 2304, 1, 4096, 2304, 2304, 2304, 2304] + - [38, 20257.0] + - - [576, 2304, 1, 4096, 576, 576, 576, 2304] + - [38, 14777.0] + - - [2304, 1728, 1, 4096, 2304, 2304, 2304, 1728] + - [38, 19418.0] + - - [2304, 12672, 1, 8192, 2304, 2304, 2304, 12672] + - [21, 20789.0] + - - [2304, 2304, 1, 8192, 2304, 2304, 2304, 2304] + - [22, 20301.0] + - - [576, 2304, 1, 8192, 576, 576, 576, 2304] + - [22, 14816.0] + - - [2304, 1728, 1, 8192, 2304, 2304, 2304, 1728] + - [38, 19480.0] + - - [3072, 6400, 1, 4096, 3072, 3072, 3072, 6400] + - [37, 20497.0] + - - [1536, 3072, 1, 4096, 1536, 1536, 1536, 3072] + - [39, 20318.0] + - - [3072, 1536, 1, 4096, 3072, 3072, 3072, 1536] + - [6, 20314.0] + - - [384, 3072, 1, 4096, 384, 384, 384, 3072] + - [41, 19157.0] + - - [3072, 1152, 1, 4096, 3072, 3072, 3072, 1152] + - [5, 20099.0] + - - [3072, 6400, 1, 8192, 3072, 3072, 3072, 6400] + - [21, 20524.0] + - - [1536, 3072, 1, 8192, 1536, 1536, 1536, 3072] + - [23, 20419.0] + - - [3072, 1536, 1, 8192, 3072, 3072, 3072, 1536] + - [39, 20392.0] + - - [384, 3072, 1, 8192, 384, 384, 384, 3072] + - [41, 19204.0] + - - [3072, 1152, 1, 8192, 3072, 3072, 3072, 1152] + - [22, 20130.0] + - - [2048, 2048, 1, 4096, 2048, 2048, 2048, 2048] + - [0, 18613.0] + - - [2048, 2048, 1, 8, 2048, 2048, 2048, 2048] + - [0, 3631.0] + - - [2048, 29000, 1, 199, 2048, 2048, 2048, 29000] + - [1, 19965.0] + - - [2048, 29000, 1, 221, 2048, 2048, 2048, 29000] + - [21, 20037.0] + - - [2048, 29000, 1, 224, 2048, 2048, 2048, 29000] + - [21, 20131.0] + - - [2048, 29000, 1, 229, 2048, 2048, 2048, 29000] + - [21, 20135.0] + - - [2048, 29000, 1, 234, 2048, 2048, 2048, 29000] + - [21, 20128.0] + - - [2048, 29000, 1, 242, 2048, 2048, 2048, 29000] + - [21, 20187.0] + - - [2048, 29000, 1, 246, 2048, 2048, 2048, 29000] + - [21, 20220.0] + - - [2048, 29000, 1, 247, 2048, 2048, 2048, 29000] + - [21, 20241.0] + - - [2048, 29000, 1, 256, 2048, 2048, 2048, 29000] + - [21, 20362.0] + - - [2048, 29000, 1, 262, 2048, 2048, 2048, 29000] + - [21, 20282.0] + - - [2048, 29000, 1, 264, 2048, 2048, 2048, 29000] + - [21, 20291.0] + - - [2048, 29000, 1, 265, 2048, 2048, 2048, 29000] + - [21, 20273.0] + - - [2048, 29000, 1, 274, 2048, 2048, 2048, 29000] + - [21, 20309.0] + - - [2048, 29000, 1, 277, 2048, 2048, 2048, 29000] + - [21, 20296.0] + - - [2048, 29000, 1, 279, 2048, 2048, 2048, 29000] + - [21, 20340.0] + - - [2048, 29000, 1, 288, 2048, 2048, 2048, 29000] + - [21, 20456.0] + - - [2048, 29000, 1, 296, 2048, 2048, 2048, 29000] + - [21, 20401.0] + - - [2048, 29000, 1, 315, 2048, 2048, 2048, 29000] + - [4, 20402.0] + - - [2048, 29000, 1, 335, 2048, 2048, 2048, 29000] + - [4, 20459.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [37, 19597.0] + - - [4096, 2048, 1, 4096, 4096, 4096, 4096, 2048] + - [4, 19622.0] + - - [1024, 29000, 1, 2283, 1024, 1024, 1024, 29000] + - [21, 20703.0] + - - [1024, 29000, 1, 2296, 1024, 1024, 1024, 29000] + - [37, 20681.0] + - - [1024, 29000, 1, 2306, 1024, 1024, 1024, 29000] + - [21, 20720.0] + - - [1024, 29000, 1, 2309, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 2318, 1024, 1024, 1024, 29000] + - [37, 20684.0] + - - [1024, 29000, 1, 2320, 1024, 1024, 1024, 29000] + - [37, 20687.0] + - - [1024, 29000, 1, 2324, 1024, 1024, 1024, 29000] + - [37, 20688.0] + - - [1024, 29000, 1, 2325, 1024, 1024, 1024, 29000] + - [21, 20688.0] + - - [1024, 29000, 1, 2329, 1024, 1024, 1024, 29000] + - [37, 20686.0] + - - [1024, 29000, 1, 2338, 1024, 1024, 1024, 29000] + - [21, 20686.0] + - - [1024, 29000, 1, 2345, 1024, 1024, 1024, 29000] + - [21, 20688.0] + - - [1024, 29000, 1, 2350, 1024, 1024, 1024, 29000] + - [21, 20704.0] + - - [1024, 29000, 1, 2362, 1024, 1024, 1024, 29000] + - [37, 20690.0] + - - [1024, 29000, 1, 2366, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 2368, 1024, 1024, 1024, 29000] + - [37, 20694.0] + - - [1024, 29000, 1, 2374, 1024, 1024, 1024, 29000] + - [37, 20687.0] + - - [1024, 29000, 1, 2390, 1024, 1024, 1024, 29000] + - [21, 20689.0] + - - [1024, 29000, 1, 561, 1024, 1024, 1024, 29000] + - [21, 20391.0] + - - [1024, 29000, 1, 574, 1024, 1024, 1024, 29000] + - [21, 20397.0] + - - [1024, 29000, 1, 600, 1024, 1024, 1024, 29000] + - [21, 20433.0] + - - [1024, 29000, 1, 608, 1024, 1024, 1024, 29000] + - [21, 20432.0] + - - [1024, 29000, 1, 615, 1024, 1024, 1024, 29000] + - [21, 20434.0] + - - [1024, 29000, 1, 622, 1024, 1024, 1024, 29000] + - [21, 20426.0] + - - [1024, 29000, 1, 625, 1024, 1024, 1024, 29000] + - [21, 20410.0] + - - [1024, 29000, 1, 626, 1024, 1024, 1024, 29000] + - [21, 20442.0] + - - [1024, 29000, 1, 628, 1024, 1024, 1024, 29000] + - [21, 20443.0] + - - [1024, 29000, 1, 636, 1024, 1024, 1024, 29000] + - [21, 20436.0] + - - [1024, 29000, 1, 651, 1024, 1024, 1024, 29000] + - [21, 20427.0] + - - [1024, 29000, 1, 658, 1024, 1024, 1024, 29000] + - [21, 20451.0] + - - [1024, 29000, 1, 669, 1024, 1024, 1024, 29000] + - [21, 20452.0] + - - [1024, 29000, 1, 670, 1024, 1024, 1024, 29000] + - [21, 20451.0] + - - [1024, 29000, 1, 672, 1024, 1024, 1024, 29000] + - [21, 20470.0] + - - [1024, 29000, 1, 684, 1024, 1024, 1024, 29000] + - [21, 20450.0] + - - [1024, 29000, 1, 716, 1024, 1024, 1024, 29000] + - [21, 20469.0] + - - [1024, 29000, 1, 730, 1024, 1024, 1024, 29000] + - [21, 20476.0] + - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] + - [33, 18987.0] + - - [2560, 2560, 1, 2, 2560, 2560, 2560, 2560] + - [3, 980.0] + - - [2560, 29000, 1, 109, 2560, 2560, 2560, 29000] + - [0, 19111.0] + - - [2560, 29000, 1, 121, 2560, 2560, 2560, 29000] + - [15, 19117.0] + - - [2560, 29000, 1, 27, 2560, 2560, 2560, 29000] + - [32, 5335.0] + - - [2560, 29000, 1, 35, 2560, 2560, 2560, 29000] + - [17, 6709.0] + - - [2560, 29000, 1, 36, 2560, 2560, 2560, 29000] + - [8, 6896.0] + - - [2560, 29000, 1, 39, 2560, 2560, 2560, 29000] + - [20, 7463.0] + - - [2560, 29000, 1, 40, 2560, 2560, 2560, 29000] + - [16, 7625.0] + - - [2560, 29000, 1, 42, 2560, 2560, 2560, 29000] + - [16, 8005.0] + - - [2560, 29000, 1, 43, 2560, 2560, 2560, 29000] + - [11, 8168.0] + - - [2560, 29000, 1, 44, 2560, 2560, 2560, 29000] + - [28, 8354.0] + - - [2560, 29000, 1, 46, 2560, 2560, 2560, 29000] + - [2, 8731.0] + - - [2560, 29000, 1, 48, 2560, 2560, 2560, 29000] + - [17, 9072.0] + - - [2560, 29000, 1, 49, 2560, 2560, 2560, 29000] + - [20, 9295.0] + - - [2560, 29000, 1, 50, 2560, 2560, 2560, 29000] + - [20, 9482.0] + - - [2560, 29000, 1, 51, 2560, 2560, 2560, 29000] + - [3, 9653.0] + - - [2560, 29000, 1, 53, 2560, 2560, 2560, 29000] + - [11, 10008.0] + - - [2560, 29000, 1, 54, 2560, 2560, 2560, 29000] + - [8, 10168.0] + - - [2560, 29000, 1, 55, 2560, 2560, 2560, 29000] + - [13, 10367.0] + - - [2560, 29000, 1, 56, 2560, 2560, 2560, 29000] + - [3, 10557.0] + - - [2560, 29000, 1, 57, 2560, 2560, 2560, 29000] + - [13, 10721.0] + - - [2560, 29000, 1, 58, 2560, 2560, 2560, 29000] + - [13, 10913.0] + - - [2560, 29000, 1, 59, 2560, 2560, 2560, 29000] + - [2, 11095.0] + - - [2560, 29000, 1, 61, 2560, 2560, 2560, 29000] + - [13, 11472.0] + - - [2560, 29000, 1, 63, 2560, 2560, 2560, 29000] + - [2, 11792.0] + - - [2560, 29000, 1, 65, 2560, 2560, 2560, 29000] + - [2, 12136.0] + - - [2560, 29000, 1, 66, 2560, 2560, 2560, 29000] + - [2, 12345.0] + - - [2560, 29000, 1, 67, 2560, 2560, 2560, 29000] + - [13, 12466.0] + - - [2560, 29000, 1, 69, 2560, 2560, 2560, 29000] + - [2, 12871.0] + - - [2560, 29000, 1, 70, 2560, 2560, 2560, 29000] + - [13, 13012.0] + - - [2560, 29000, 1, 71, 2560, 2560, 2560, 29000] + - [2, 13217.0] + - - [2560, 29000, 1, 73, 2560, 2560, 2560, 29000] + - [2, 13558.0] + - - [2560, 29000, 1, 74, 2560, 2560, 2560, 29000] + - [2, 13762.0] + - - [2560, 29000, 1, 75, 2560, 2560, 2560, 29000] + - [0, 13831.0] + - - [2560, 29000, 1, 77, 2560, 2560, 2560, 29000] + - [17, 14093.0] + - - [2560, 29000, 1, 78, 2560, 2560, 2560, 29000] + - [2, 14297.0] + - - [2560, 29000, 1, 80, 2560, 2560, 2560, 29000] + - [2, 14787.0] + - - [2560, 29000, 1, 81, 2560, 2560, 2560, 29000] + - [5, 14846.0] + - - [2560, 29000, 1, 82, 2560, 2560, 2560, 29000] + - [5, 15008.0] + - - [2560, 29000, 1, 83, 2560, 2560, 2560, 29000] + - [5, 15253.0] + - - [2560, 29000, 1, 84, 2560, 2560, 2560, 29000] + - [5, 15391.0] + - - [2560, 29000, 1, 88, 2560, 2560, 2560, 29000] + - [0, 16042.0] + - - [2560, 29000, 1, 89, 2560, 2560, 2560, 29000] + - [5, 16175.0] + - - [2560, 29000, 1, 90, 2560, 2560, 2560, 29000] + - [5, 16289.0] + - - [2560, 29000, 1, 92, 2560, 2560, 2560, 29000] + - [5, 16598.0] + - - [2560, 29000, 1, 95, 2560, 2560, 2560, 29000] + - [15, 16915.0] + - - [2560, 29000, 1, 98, 2560, 2560, 2560, 29000] + - [5, 17675.0] + - - [2560, 4096, 1, 1024, 2560, 2560, 2560, 4096] + - [37, 20303.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 2560] + - [46, 20348.0] + - - [1024, 3072, 1, 32768, 1024, 1024, 1024, 3072] + - [20, 18912.0] + - - [1024, 4096, 1, 32768, 1024, 1024, 1024, 4096] + - [20, 18660.0] + - - [1024, 50304, 1, 32768, 1024, 1024, 1024, 50304] + - [6, 20559.0] + - - [4096, 1024, 1, 32768, 4096, 4096, 4096, 1024] + - [45, 18639.0] + - - [1024, 128, 24, 1024, 1024, 1024, 1024, 128] + - [17, 18566.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [33, 18622.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [52, 16310.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [54, 18032.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [48, 18126.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [58, 15943.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [51, 16054.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [51, 16140.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [53, 16310.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [49, 17941.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [57, 18211.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [55, 18330.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [54, 17952.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [55, 18345.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [56, 17923.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [55, 18345.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [56, 18184.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [50, 17448.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [55, 16390.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [55, 14661.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [48, 17929.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [48, 18066.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [48, 18058.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [48, 18111.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [48, 18006.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [50, 16473.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [54, 18262.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [48, 18090.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [54, 18111.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [48, 18099.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [48, 18100.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [48, 18089.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [48, 18120.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [48, 18127.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [54, 18099.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [48, 18189.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [88, 15157.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [63, 14094.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [74, 17007.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [96, 16838.0] + - - [768, 768, 1, 16, 768, 768, 768, 768] + - [92, 3775.0] + - - [768, 768, 1, 320, 768, 768, 768, 768] + - [63, 13579.0] + - - [768, 768, 1, 4096, 768, 768, 768, 768] + - [116, 16815.0] + - - [768, 768, 1, 32, 768, 768, 768, 768] + - [95, 5519.0] + - - [768, 768, 1, 640, 768, 768, 768, 768] + - [63, 15063.0] + - - [768, 768, 1, 64, 768, 768, 768, 768] + - [92, 7316.0] + - - [768, 768, 1, 1280, 768, 768, 768, 768] + - [63, 15975.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [96, 17006.0] + - - [1024, 1024, 1, 120, 1024, 1024, 1024, 1024] + - [63, 12710.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [92, 324.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [107, 4810.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [107, 1279.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [107, 1829.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [61, 11367.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [63, 16798.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [87, 16720.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [62, 11001.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [63, 16648.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [88, 13981.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 448] + - [61, 12883.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [103, 17547.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [63, 16495.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [61, 12290.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [65, 10850.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [63, 16272.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [87, 9655.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [60, 7682.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [108, 8169.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [63, 13036.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [110, 12956.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [63, 12494.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [86, 14884.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [62, 7780.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [104, 11211.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [110, 15225.0] + - - [704, 1024, 1, 128, 704, 704, 704, 1024] + - [68, 10730.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [87, 12530.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [118, 15926.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [87, 9565.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [87, 12949.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [71, 15899.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 256] + - [63, 11012.0] + - - [448, 2944, 1, 128, 448, 448, 448, 2944] + - [86, 13998.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [89, 9179.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [62, 9384.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [109, 10970.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [65, 9756.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 448] + - [89, 8247.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [96, 14934.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [65, 9487.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [63, 12486.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [87, 9571.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 256] + - [63, 11514.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [109, 14485.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [62, 9050.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [86, 12581.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [72, 11286.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [87, 14144.0] + - - [448, 1856, 1, 128, 448, 448, 448, 1856] + - [86, 11852.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [63, 14231.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [63, 12880.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [71, 13870.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [64, 15537.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [74, 14883.0] + - - [704, 1856, 1, 128, 704, 704, 704, 1856] + - [61, 13937.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [71, 13051.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [72, 10893.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [63, 10212.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [63, 12740.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [88, 13034.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [63, 15253.0] + - - [448, 2368, 1, 128, 448, 448, 448, 2368] + - [109, 12982.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [63, 13851.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [61, 16337.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [62, 9205.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [84, 14074.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [61, 10844.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [85, 9328.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [88, 12899.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [87, 16187.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [63, 12414.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [96, 16558.0] + - - [448, 1024, 1, 128, 448, 448, 448, 1024] + - [62, 8270.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [61, 16044.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 64] + - [62, 6973.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [86, 12419.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [88, 13773.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [61, 10450.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [111, 16978.0] + - - [256, 1856, 1, 128, 256, 256, 256, 1856] + - [63, 8494.0] + - - [448, 1408, 1, 128, 448, 448, 448, 1408] + - [68, 9477.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [87, 12161.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [85, 8026.0] + - - [704, 1408, 1, 128, 704, 704, 704, 1408] + - [61, 12318.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 448] + - [61, 14022.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [62, 7907.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [69, 14583.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [87, 12030.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [112, 9539.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [96, 17021.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [61, 15845.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [109, 11861.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [87, 14892.0] + - - [256, 2368, 1, 128, 256, 256, 256, 2368] + - [63, 9237.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [86, 15040.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [110, 12805.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [61, 10528.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [87, 16077.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [87, 10962.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [88, 14636.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [61, 15040.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [63, 15422.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [63, 12109.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [63, 15375.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 448] + - [86, 11932.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [60, 8948.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [61, 9403.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [65, 9491.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [63, 11854.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [91, 16097.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [63, 10194.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [60, 9328.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [87, 11270.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [111, 16957.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [88, 15399.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 256] + - [62, 7715.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [85, 10611.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [60, 7698.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [87, 12235.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [72, 11312.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [87, 16725.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [63, 15157.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [63, 12897.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [114, 16520.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [85, 8121.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 704] + - [63, 10680.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [87, 13130.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [61, 11989.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 448] + - [62, 9477.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [87, 14883.0] + - - [704, 448, 1, 128, 704, 704, 704, 448] + - [62, 6889.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [110, 12835.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [62, 9348.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [68, 11049.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [61, 14022.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [111, 13047.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [63, 12446.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [91, 15220.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [111, 15492.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [61, 15209.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [61, 14508.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [89, 9200.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [77, 12670.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [112, 9769.0] + - - [64, 5888, 1, 128, 64, 64, 64, 5888] + - [62, 7959.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [63, 12986.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [108, 8042.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [72, 11308.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [87, 12871.0] + - - [704, 704, 1, 128, 704, 704, 704, 704] + - [62, 8885.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [86, 11525.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [62, 6643.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [90, 9677.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [93, 16368.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [88, 12925.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [91, 12332.0] + - - [256, 3584, 1, 128, 256, 256, 256, 3584] + - [63, 11582.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 256] + - [61, 13331.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 256] + - [62, 9129.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [63, 10707.0] + - - [256, 2944, 1, 128, 256, 256, 256, 2944] + - [63, 11165.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [65, 10856.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [61, 14508.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [93, 17554.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 704] + - [86, 13799.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [61, 9650.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [63, 10985.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [63, 11804.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [62, 8247.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [63, 10673.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [110, 16519.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [86, 16116.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 256] + - [62, 8494.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [61, 10986.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [61, 10106.0] + - - [64, 6784, 1, 128, 64, 64, 64, 6784] + - [73, 8008.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [117, 10854.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [62, 8247.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [60, 9047.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [110, 15223.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [89, 10563.0] + - - [64, 5056, 1, 128, 64, 64, 64, 5056] + - [62, 6949.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 64] + - [108, 7656.0] + - - [448, 704, 1, 128, 448, 448, 448, 704] + - [62, 6819.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 704] + - [63, 12342.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [110, 12221.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [69, 14800.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [88, 14556.0] + - - [256, 1408, 1, 128, 256, 256, 256, 1408] + - [108, 7588.0] + - - [256, 4288, 1, 128, 256, 256, 256, 4288] + - [63, 13537.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [63, 11038.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [88, 13385.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [72, 10854.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 64] + - [62, 7894.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [61, 13227.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [108, 8960.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [93, 14495.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [64, 12388.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [86, 16310.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [61, 10177.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [72, 10898.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [62, 6598.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [110, 14919.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [63, 14533.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [63, 12309.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [110, 13407.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [68, 10983.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [109, 13657.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [108, 9028.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [61, 16180.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [88, 13777.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [61, 14041.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [87, 14859.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [65, 9535.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [110, 16480.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [59, 9674.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [90, 10273.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [90, 10268.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [59, 6134.0] + - - [64, 1536, 64, 384, 64, 64, 64, 1536] + - [98, 12118.0] + - - [64, 1536, 64, 256, 64, 64, 64, 1536] + - [109, 18228.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [61, 10734.0] + - - [1024, 1024, 1, 3975, 1024, 1024, 1024, 1024] + - [93, 16909.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [61, 14091.0] + - - [64, 102, 624, 100, 64, 64, 64, 102] + - [61, 11974.0] + - - [64, 112, 576, 111, 64, 64, 64, 112] + - [86, 12859.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [109, 11892.0] + - - [64, 133, 480, 135, 64, 64, 64, 133] + - [61, 11944.0] + - - [1024, 1024, 1, 4026, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 160, 400, 159, 64, 64, 64, 160] + - [61, 14421.0] + - - [1024, 1024, 1, 3780, 1024, 1024, 1024, 1024] + - [96, 16963.0] + - - [64, 228, 272, 232, 64, 64, 64, 228] + - [61, 15644.0] + - - [1024, 1024, 1, 3822, 1024, 1024, 1024, 1024] + - [96, 16971.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [61, 9713.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [61, 14057.0] + - - [64, 135, 480, 134, 64, 64, 64, 135] + - [61, 11839.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [109, 11684.0] + - - [1024, 1024, 1, 3942, 1024, 1024, 1024, 1024] + - [96, 16992.0] + - - [1024, 1024, 1, 3861, 1024, 1024, 1024, 1024] + - [96, 16992.0] + - - [1024, 1024, 1, 4000, 1024, 1024, 1024, 1024] + - [96, 17078.0] + - - [1024, 1024, 1, 3870, 1024, 1024, 1024, 1024] + - [96, 16983.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [61, 8163.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [109, 11860.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [61, 15849.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [109, 12985.0] + - - [1024, 1024, 1, 4032, 1024, 1024, 1024, 1024] + - [74, 17048.0] + - - [1024, 1024, 1, 4012, 1024, 1024, 1024, 1024] + - [96, 17002.0] + - - [1024, 1024, 1, 3681, 1024, 1024, 1024, 1024] + - [96, 16957.0] + - - [1024, 1024, 1, 3927, 1024, 1024, 1024, 1024] + - [96, 16978.0] + - - [1024, 1024, 1, 3894, 1024, 1024, 1024, 1024] + - [96, 16989.0] + - - [64, 132, 480, 135, 64, 64, 64, 132] + - [61, 11539.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [109, 12082.0] + - - [1024, 1024, 1, 3876, 1024, 1024, 1024, 1024] + - [96, 16981.0] + - - [64, 84, 752, 85, 64, 64, 64, 84] + - [61, 10438.0] + - - [1024, 1024, 1, 4050, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [109, 11826.0] + - - [64, 99, 624, 102, 64, 64, 64, 99] + - [109, 11854.0] + - - [64, 143, 432, 148, 64, 64, 64, 143] + - [86, 12600.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 1024, 1024] + - [96, 17040.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [61, 14295.0] + - - [64, 148, 432, 147, 64, 64, 64, 148] + - [86, 13082.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [118, 16982.0] + - - [64, 123, 528, 122, 64, 64, 64, 123] + - [86, 14086.0] + - - [64, 102, 624, 101, 64, 64, 64, 102] + - [86, 11918.0] + - - [1024, 1024, 1, 3978, 1024, 1024, 1024, 1024] + - [96, 16994.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [61, 14315.0] + - - [1024, 1024, 1, 3995, 1024, 1024, 1024, 1024] + - [96, 16971.0] + - - [64, 132, 480, 134, 64, 64, 64, 132] + - [61, 11843.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [61, 12962.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [109, 11767.0] + - - [1024, 1024, 1, 3977, 1024, 1024, 1024, 1024] + - [96, 16996.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [86, 13167.0] + - - [64, 159, 400, 162, 64, 64, 64, 159] + - [61, 14054.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [109, 13787.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [109, 15608.0] + - - [1024, 1024, 1, 3925, 1024, 1024, 1024, 1024] + - [96, 16979.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [86, 11247.0] + - - [1024, 1024, 1, 3956, 1024, 1024, 1024, 1024] + - [96, 16987.0] + - - [1024, 1024, 1, 3976, 1024, 1024, 1024, 1024] + - [96, 16989.0] + - - [64, 111, 576, 112, 64, 64, 64, 111] + - [86, 13117.0] + - - [64, 100, 624, 102, 64, 64, 64, 100] + - [61, 11848.0] + - - [1024, 1024, 1, 3955, 1024, 1024, 1024, 1024] + - [96, 17010.0] + - - [1024, 1024, 1, 4030, 1024, 1024, 1024, 1024] + - [96, 17015.0] + - - [1024, 1024, 1, 3906, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 101, 624, 102, 64, 64, 64, 101] + - [61, 11816.0] + - - [1024, 1024, 1, 3796, 1024, 1024, 1024, 1024] + - [96, 16986.0] + - - [1024, 1024, 1, 3859, 1024, 1024, 1024, 1024] + - [96, 16982.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [109, 8829.0] + - - [1024, 1024, 1, 3860, 1024, 1024, 1024, 1024] + - [96, 16966.0] + - - [1024, 1024, 1, 4005, 1024, 1024, 1024, 1024] + - [96, 17004.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [86, 10423.0] + - - [1024, 1024, 1, 3990, 1024, 1024, 1024, 1024] + - [96, 16998.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [61, 11802.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [61, 9935.0] + - - [1024, 1024, 1, 3999, 1024, 1024, 1024, 1024] + - [96, 16993.0] + - - [1024, 1024, 1, 4020, 1024, 1024, 1024, 1024] + - [96, 16990.0] + - - [1024, 1024, 1, 3939, 1024, 1024, 1024, 1024] + - [96, 17001.0] + - - [64, 77, 816, 78, 64, 64, 64, 77] + - [109, 9705.0] + - - [1024, 1024, 1, 4059, 1024, 1024, 1024, 1024] + - [96, 17017.0] + - - [1024, 1024, 1, 3944, 1024, 1024, 1024, 1024] + - [96, 16995.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [61, 13166.0] + - - [1024, 1024, 1, 3720, 1024, 1024, 1024, 1024] + - [96, 16986.0] + - - [1024, 1024, 1, 3910, 1024, 1024, 1024, 1024] + - [96, 16997.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [86, 12399.0] + - - [64, 92, 688, 93, 64, 64, 64, 92] + - [109, 11280.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [86, 11877.0] + - - [1024, 1024, 1, 3969, 1024, 1024, 1024, 1024] + - [96, 17006.0] + - - [1024, 1024, 1, 3948, 1024, 1024, 1024, 1024] + - [96, 16990.0] + - - [1024, 1024, 1, 3996, 1024, 1024, 1024, 1024] + - [96, 17009.0] + - - [1024, 1024, 1, 3900, 1024, 1024, 1024, 1024] + - [96, 16998.0] + - - [1024, 1024, 1, 3640, 1024, 1024, 1024, 1024] + - [96, 16964.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [86, 12977.0] + - - [1024, 1024, 1, 3751, 1024, 1024, 1024, 1024] + - [96, 16978.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [86, 16004.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [61, 10474.0] + - - [1024, 1024, 1, 3712, 1024, 1024, 1024, 1024] + - [96, 17051.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [61, 12807.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [86, 16890.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [61, 15865.0] + - - [64, 192, 36, 25088, 64, 64, 64, 192] + - [73, 10415.0] + - - [128, 128, 64, 25, 128, 128, 128, 128] + - [75, 5160.0] + - - [64, 192, 64, 3200, 64, 64, 64, 192] + - [79, 11264.0] + - - [64, 128, 64, 23104, 64, 64, 64, 128] + - [76, 9224.0] + - - [128, 128, 64, 1600, 128, 128, 128, 128] + - [111, 16400.0] + - - [80, 192, 64, 4608, 80, 80, 80, 192] + - [123, 10407.0] + - - [64, 128, 36, 30, 64, 64, 64, 128] + - [95, 2528.0] + - - [64, 128, 64, 11552, 64, 64, 64, 128] + - [102, 9564.0] + - - [128, 192, 64, 946, 128, 128, 128, 192] + - [63, 16351.0] + - - [64, 192, 64, 12800, 64, 64, 64, 192] + - [113, 9738.0] + - - [224, 224, 64, 128, 224, 224, 224, 224] + - [86, 12255.0] + - - [128, 128, 64, 3360, 128, 128, 128, 128] + - [103, 15029.0] + - - [128, 128, 64, 420, 128, 128, 128, 128] + - [88, 14690.0] + - - [64, 128, 64, 361, 64, 64, 64, 128] + - [61, 11859.0] + - - [64, 128, 36, 53824, 64, 64, 64, 128] + - [124, 9545.0] + - - [128, 160, 36, 512, 128, 128, 128, 160] + - [64, 10439.0] + - - [147, 64, 36, 18816, 147, 147, 147, 64] + - [69, 9291.0] + - - [96, 128, 64, 946, 96, 96, 96, 128] + - [111, 11610.0] + - - [128, 128, 64, 50, 128, 128, 128, 128] + - [62, 8244.0] + - - [160, 224, 36, 128, 160, 160, 160, 224] + - [109, 11039.0] + - - [192, 224, 64, 1152, 192, 192, 192, 224] + - [109, 15408.0] + - - [128, 128, 36, 784, 128, 128, 128, 128] + - [87, 15272.0] + - - [96, 128, 64, 288, 96, 96, 96, 128] + - [85, 10623.0] + - - [128, 128, 64, 400, 128, 128, 128, 128] + - [111, 15353.0] + - - [128, 128, 64, 800, 128, 128, 128, 128] + - [111, 16151.0] + - - [96, 128, 36, 512, 96, 96, 96, 128] + - [61, 10554.0] + - - [96, 128, 64, 800, 96, 96, 96, 128] + - [103, 11764.0] + - - [192, 224, 64, 128, 192, 192, 192, 224] + - [109, 13489.0] + - - [128, 128, 64, 288, 128, 128, 128, 128] + - [88, 14660.0] + - - [96, 208, 36, 512, 96, 96, 96, 208] + - [63, 10235.0] + - - [64, 128, 36, 1568, 64, 64, 64, 128] + - [62, 12331.0] + - - [192, 192, 36, 512, 192, 192, 192, 192] + - [109, 15736.0] + - - [128, 128, 36, 512, 128, 128, 128, 128] + - [110, 14519.0] + - - [96, 208, 64, 1152, 96, 96, 96, 208] + - [116, 10614.0] + - - [128, 192, 64, 3200, 128, 128, 128, 192] + - [77, 15101.0] + - - [160, 160, 64, 288, 160, 160, 160, 160] + - [86, 11898.0] + - - [128, 128, 36, 440, 128, 128, 128, 128] + - [110, 14197.0] + - - [96, 128, 36, 1568, 96, 96, 96, 128] + - [110, 11829.0] + - - [112, 224, 36, 2048, 112, 112, 112, 224] + - [116, 14051.0] + - - [128, 128, 36, 7040, 128, 128, 128, 128] + - [125, 13779.0] + - - [128, 128, 36, 1568, 128, 128, 128, 128] + - [63, 15649.0] + - - [160, 224, 64, 128, 160, 160, 160, 224] + - [86, 10700.0] + - - [192, 224, 36, 2592, 192, 192, 192, 224] + - [61, 15615.0] + - - [64, 128, 64, 2888, 64, 64, 64, 128] + - [66, 13656.0] + - - [64, 128, 36, 480, 64, 64, 64, 128] + - [62, 10991.0] + - - [147, 64, 64, 9702, 147, 147, 147, 64] + - [109, 8768.0] + - - [64, 192, 64, 3698, 64, 64, 64, 192] + - [101, 10746.0] + - - [73, 192, 64, 10439, 73, 73, 73, 192] + - [81, 9604.0] + - - [128, 128, 36, 880, 128, 128, 128, 128] + - [87, 14796.0] + - - [192, 224, 36, 128, 192, 192, 192, 224] + - [86, 13427.0] + - - [64, 128, 36, 12544, 64, 64, 64, 128] + - [124, 9482.0] + - - [160, 160, 36, 512, 160, 160, 160, 160] + - [86, 10518.0] + - - [128, 128, 36, 3136, 128, 128, 128, 128] + - [71, 16548.0] + - - [112, 224, 36, 512, 112, 112, 112, 224] + - [61, 12655.0] + - - [128, 128, 36, 49, 128, 128, 128, 128] + - [67, 6229.0] + - - [112, 224, 64, 1152, 112, 112, 112, 224] + - [93, 13282.0] + - - [128, 192, 36, 1568, 128, 128, 128, 192] + - [63, 13779.0] + - - [128, 192, 36, 512, 128, 128, 128, 192] + - [87, 13115.0] + - - [192, 192, 64, 288, 192, 192, 192, 192] + - [61, 17797.0] + - - [96, 208, 64, 242, 96, 96, 96, 208] + - [71, 9640.0] + - - [64, 128, 64, 5776, 64, 64, 64, 128] + - [83, 9402.0] + - - [128, 192, 64, 288, 128, 128, 128, 192] + - [110, 15150.0] + - - [96, 128, 36, 6272, 96, 96, 96, 128] + - [103, 12147.0] + - - [96, 128, 64, 3200, 96, 96, 96, 128] + - [125, 12343.0] + - - [128, 192, 64, 800, 128, 128, 128, 192] + - [110, 16312.0] + - - [64, 128, 64, 10, 64, 64, 64, 128] + - [82, 1736.0] + - - [96, 208, 64, 288, 96, 96, 96, 208] + - [93, 9883.0] + - - [64, 128, 64, 160, 64, 64, 64, 128] + - [61, 9642.0] + - - [128, 128, 64, 1568, 128, 128, 128, 128] + - [88, 16578.0] + - - [112, 224, 64, 242, 112, 112, 112, 224] + - [61, 11971.0] + - - [160, 192, 64, 288, 160, 160, 160, 192] + - [61, 14662.0] + - - [128, 160, 64, 288, 128, 128, 128, 160] + - [110, 12981.0] + - - [128, 128, 64, 210, 128, 128, 128, 128] + - [64, 13884.0] + - - [73, 192, 36, 23360, 73, 73, 73, 192] + - [108, 8693.0] + - - [160, 192, 36, 512, 160, 160, 160, 192] + - [109, 12633.0] + - - [64, 128, 64, 722, 64, 64, 64, 128] + - [61, 13116.0] + - - [112, 224, 64, 288, 112, 112, 112, 224] + - [61, 12220.0] + - - [64, 192, 36, 6272, 64, 64, 64, 192] + - [73, 10298.0] + - - [64, 128, 36, 6272, 64, 64, 64, 128] + - [80, 11172.0] + - - [128, 128, 36, 3200, 128, 128, 128, 128] + - [88, 16193.0] + - - [128, 128, 36, 392, 128, 128, 128, 128] + - [87, 13945.0] + - - [80, 192, 36, 10368, 80, 80, 80, 192] + - [97, 9554.0] + - - [224, 224, 36, 128, 224, 224, 224, 224] + - [61, 11537.0] + - - [64, 128, 36, 784, 64, 64, 64, 128] + - [62, 11749.0] + - - [128, 128, 64, 200, 128, 128, 128, 128] + - [63, 13907.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [63, 17899.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [109, 17746.0] + - - [289, 1792, 1, 320, 289, 289, 289, 1792] + - [61, 10720.0] + - - [1001, 1024, 1, 32, 1001, 1001, 1001, 1024] + - [60, 6694.0] + - - [784, 400, 1, 32, 784, 784, 784, 400] + - [107, 3323.0] + - - [64, 1536, 32, 256, 64, 64, 64, 1536] + - [98, 16862.0] + - - [289, 2592, 1, 384, 289, 289, 289, 2592] + - [86, 12650.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [86, 17814.0] + - - [289, 2016, 1, 256, 289, 289, 289, 2016] + - [62, 8815.0] + - - [64, 1536, 32, 384, 64, 64, 64, 1536] + - [61, 17188.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [86, 16397.0] + - - [289, 3456, 1, 384, 289, 289, 289, 3456] + - [109, 13639.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [119, 16289.0] + - - [729, 1600, 1, 192, 729, 729, 729, 1600] + - [63, 12856.0] + - - [289, 1344, 1, 192, 289, 289, 289, 1344] + - [62, 8417.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [98, 17796.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [61, 15892.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [119, 15612.0] + - - [289, 1792, 1, 256, 289, 289, 289, 1792] + - [61, 10293.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [61, 17575.0] + - - [5329, 64, 128, 80, 5329, 5329, 5329, 64] + - [59, 6942.0] + - - [64, 1280, 128, 448, 64, 64, 64, 1280] + - [124, 10988.0] + - - [64, 2048, 128, 192, 64, 64, 64, 2048] + - [98, 9388.0] + - - [64, 1280, 128, 384, 64, 64, 64, 1280] + - [115, 10767.0] + - - [64, 1280, 128, 320, 64, 64, 64, 1280] + - [90, 10340.0] + - - [64, 1280, 128, 192, 64, 64, 64, 1280] + - [98, 11135.0] + - - [256, 4096, 1, 6400, 256, 256, 256, 4096] + - [127, 16977.0] + - - [512, 2048, 1, 3427, 512, 512, 512, 2048] + - [96, 16880.0] + - - [512, 2048, 1, 3552, 512, 512, 512, 2048] + - [96, 17007.0] + - - [512, 2048, 1, 3840, 512, 512, 512, 2048] + - [96, 17064.0] + - - [2048, 512, 1, 3427, 2048, 2048, 2048, 512] + - [118, 16888.0] + - - [2048, 512, 1, 3452, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3472, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3475, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [61, 10126.0] + - - [64, 64, 496, 65, 64, 64, 64, 64] + - [109, 10986.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [109, 7418.0] + - - [64, 71, 448, 71, 64, 64, 64, 71] + - [61, 7567.0] + - - [64, 77, 408, 77, 64, 64, 64, 77] + - [86, 7843.0] + - - [64, 77, 408, 78, 64, 64, 64, 77] + - [61, 7993.0] + - - [64, 78, 408, 78, 64, 64, 64, 78] + - [86, 8032.0] + - - [64, 85, 376, 85, 64, 64, 64, 85] + - [86, 8383.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [109, 9298.0] + - - [64, 112, 288, 112, 64, 64, 64, 112] + - [61, 11549.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [61, 11896.0] + - - [64, 123, 264, 122, 64, 64, 64, 123] + - [61, 11903.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [61, 11780.0] + - - [64, 134, 240, 134, 64, 64, 64, 134] + - [61, 10140.0] + - - [64, 135, 240, 134, 64, 64, 64, 135] + - [61, 10261.0] + - - [64, 135, 240, 135, 64, 64, 64, 135] + - [109, 10337.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [61, 18210.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [61, 18320.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [61, 18111.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [119, 13459.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [86, 18474.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [119, 10848.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [115, 10971.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [90, 11227.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [61, 17084.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [90, 10226.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [59, 7991.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [61, 12607.0] + - - [512, 2048, 1, 2790, 512, 512, 512, 2048] + - [118, 16856.0] + - - [512, 2048, 1, 2864, 512, 512, 512, 2048] + - [118, 16887.0] + - - [512, 2048, 1, 3092, 512, 512, 512, 2048] + - [118, 16906.0] + - - [512, 2048, 1, 3113, 512, 512, 512, 2048] + - [118, 16911.0] + - - [512, 2048, 1, 3137, 512, 512, 512, 2048] + - [96, 16896.0] + - - [512, 2048, 1, 3165, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3166, 512, 512, 512, 2048] + - [118, 16939.0] + - - [512, 2048, 1, 3194, 512, 512, 512, 2048] + - [118, 16923.0] + - - [512, 2048, 1, 3219, 512, 512, 512, 2048] + - [118, 16934.0] + - - [512, 2048, 1, 3222, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3234, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3237, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3242, 512, 512, 512, 2048] + - [118, 16941.0] + - - [512, 2048, 1, 3246, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3249, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3251, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3257, 512, 512, 512, 2048] + - [118, 16930.0] + - - [512, 2048, 1, 3262, 512, 512, 512, 2048] + - [118, 16933.0] + - - [512, 2048, 1, 3268, 512, 512, 512, 2048] + - [118, 16932.0] + - - [512, 2048, 1, 3282, 512, 512, 512, 2048] + - [96, 16936.0] + - - [512, 2048, 1, 3286, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3287, 512, 512, 512, 2048] + - [118, 16934.0] + - - [512, 2048, 1, 3293, 512, 512, 512, 2048] + - [96, 16933.0] + - - [512, 2048, 1, 3297, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3307, 512, 512, 512, 2048] + - [118, 16924.0] + - - [512, 2048, 1, 3314, 512, 512, 512, 2048] + - [96, 16956.0] + - - [512, 2048, 1, 3315, 512, 512, 512, 2048] + - [96, 16940.0] + - - [512, 2048, 1, 3319, 512, 512, 512, 2048] + - [96, 16950.0] + - - [512, 2048, 1, 3322, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3323, 512, 512, 512, 2048] + - [96, 16938.0] + - - [512, 2048, 1, 3324, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3325, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3327, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3329, 512, 512, 512, 2048] + - [96, 16927.0] + - - [512, 2048, 1, 3332, 512, 512, 512, 2048] + - [118, 16942.0] + - - [512, 2048, 1, 3336, 512, 512, 512, 2048] + - [118, 16941.0] + - - [512, 2048, 1, 3339, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3342, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3344, 512, 512, 512, 2048] + - [96, 16957.0] + - - [512, 2048, 1, 3358, 512, 512, 512, 2048] + - [118, 16954.0] + - - [512, 2048, 1, 3360, 512, 512, 512, 2048] + - [96, 17029.0] + - - [512, 2048, 1, 3364, 512, 512, 512, 2048] + - [118, 16949.0] + - - [512, 2048, 1, 3365, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3369, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3371, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3374, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3376, 512, 512, 512, 2048] + - [118, 16959.0] + - - [512, 2048, 1, 3377, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3378, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3381, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3382, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3383, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3384, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3385, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3386, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3388, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3390, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3391, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3396, 512, 512, 512, 2048] + - [118, 16960.0] + - - [512, 2048, 1, 3399, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3402, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3410, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3412, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3414, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3415, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3418, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3420, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3422, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3425, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3426, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3428, 512, 512, 512, 2048] + - [96, 16955.0] + - - [512, 2048, 1, 3430, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3431, 512, 512, 512, 2048] + - [118, 16959.0] + - - [512, 2048, 1, 3432, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3438, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3439, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3440, 512, 512, 512, 2048] + - [118, 16971.0] + - - [512, 2048, 1, 3443, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3445, 512, 512, 512, 2048] + - [118, 16953.0] + - - [512, 2048, 1, 3447, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3448, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3450, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3451, 512, 512, 512, 2048] + - [118, 16943.0] + - - [512, 2048, 1, 3452, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3453, 512, 512, 512, 2048] + - [96, 16951.0] + - - [512, 2048, 1, 3455, 512, 512, 512, 2048] + - [96, 16934.0] + - - [512, 2048, 1, 3456, 512, 512, 512, 2048] + - [96, 17029.0] + - - [512, 2048, 1, 3457, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3458, 512, 512, 512, 2048] + - [96, 16947.0] + - - [512, 2048, 1, 3459, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3460, 512, 512, 512, 2048] + - [118, 16936.0] + - - [512, 2048, 1, 3461, 512, 512, 512, 2048] + - [118, 16957.0] + - - [512, 2048, 1, 3462, 512, 512, 512, 2048] + - [118, 16955.0] + - - [512, 2048, 1, 3466, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3467, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3468, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3470, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3471, 512, 512, 512, 2048] + - [118, 16962.0] + - - [512, 2048, 1, 3472, 512, 512, 512, 2048] + - [118, 16954.0] + - - [512, 2048, 1, 3475, 512, 512, 512, 2048] + - [118, 16946.0] + - - [512, 2048, 1, 3476, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3477, 512, 512, 512, 2048] + - [96, 16958.0] + - - [512, 2048, 1, 3478, 512, 512, 512, 2048] + - [118, 16969.0] + - - [512, 2048, 1, 3479, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3480, 512, 512, 512, 2048] + - [118, 16952.0] + - - [512, 2048, 1, 3481, 512, 512, 512, 2048] + - [118, 16961.0] + - - [512, 2048, 1, 3483, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3484, 512, 512, 512, 2048] + - [118, 16970.0] + - - [512, 2048, 1, 3487, 512, 512, 512, 2048] + - [118, 16965.0] + - - [512, 2048, 1, 3489, 512, 512, 512, 2048] + - [118, 16958.0] + - - [512, 2048, 1, 3490, 512, 512, 512, 2048] + - [96, 16953.0] + - - [512, 2048, 1, 3491, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3493, 512, 512, 512, 2048] + - [118, 16947.0] + - - [512, 2048, 1, 3494, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3495, 512, 512, 512, 2048] + - [118, 16938.0] + - - [512, 2048, 1, 3497, 512, 512, 512, 2048] + - [96, 16950.0] + - - [512, 2048, 1, 3498, 512, 512, 512, 2048] + - [96, 16967.0] + - - [512, 2048, 1, 3499, 512, 512, 512, 2048] + - [96, 16970.0] + - - [512, 2048, 1, 3501, 512, 512, 512, 2048] + - [118, 16956.0] + - - [512, 2048, 1, 3503, 512, 512, 512, 2048] + - [118, 16944.0] + - - [512, 2048, 1, 3507, 512, 512, 512, 2048] + - [118, 16967.0] + - - [512, 2048, 1, 3508, 512, 512, 512, 2048] + - [118, 16972.0] + - - [512, 2048, 1, 3509, 512, 512, 512, 2048] + - [118, 16978.0] + - - [512, 2048, 1, 3511, 512, 512, 512, 2048] + - [118, 16972.0] + - - [512, 2048, 1, 3514, 512, 512, 512, 2048] + - [96, 16960.0] + - - [512, 2048, 1, 3515, 512, 512, 512, 2048] + - [118, 16974.0] + - - [512, 2048, 1, 3517, 512, 512, 512, 2048] + - [118, 16974.0] + - - [512, 2048, 1, 3518, 512, 512, 512, 2048] + - [96, 16967.0] + - - [512, 2048, 1, 3519, 512, 512, 512, 2048] + - [118, 16951.0] + - - [512, 2048, 1, 3520, 512, 512, 512, 2048] + - [96, 17028.0] + - - [512, 2048, 1, 3523, 512, 512, 512, 2048] + - [96, 16958.0] + - - [512, 2048, 1, 3528, 512, 512, 512, 2048] + - [96, 16974.0] + - - [512, 2048, 1, 3529, 512, 512, 512, 2048] + - [118, 16979.0] + - - [512, 2048, 1, 3530, 512, 512, 512, 2048] + - [118, 16978.0] + - - [512, 2048, 1, 3532, 512, 512, 512, 2048] + - [118, 16950.0] + - - [512, 2048, 1, 3533, 512, 512, 512, 2048] + - [96, 16975.0] + - - [512, 2048, 1, 3534, 512, 512, 512, 2048] + - [118, 16980.0] + - - [512, 2048, 1, 3538, 512, 512, 512, 2048] + - [118, 16963.0] + - - [512, 2048, 1, 3539, 512, 512, 512, 2048] + - [96, 16962.0] + - - [512, 2048, 1, 3541, 512, 512, 512, 2048] + - [96, 16975.0] + - - [512, 2048, 1, 3547, 512, 512, 512, 2048] + - [96, 16951.0] + - - [512, 2048, 1, 3548, 512, 512, 512, 2048] + - [118, 16966.0] + - - [512, 2048, 1, 3564, 512, 512, 512, 2048] + - [118, 16973.0] + - - [512, 2048, 1, 3575, 512, 512, 512, 2048] + - [118, 16979.0] + - - [512, 2048, 1, 3598, 512, 512, 512, 2048] + - [96, 16973.0] + - - [512, 2048, 1, 3599, 512, 512, 512, 2048] + - [118, 16990.0] + - - [512, 2048, 1, 3608, 512, 512, 512, 2048] + - [118, 16985.0] + - - [512, 2048, 1, 3780, 512, 512, 512, 2048] + - [74, 16957.0] + - - [512, 2048, 1, 3796, 512, 512, 512, 2048] + - [118, 17007.0] + - - [512, 2048, 1, 3822, 512, 512, 512, 2048] + - [96, 16973.0] + - - [512, 2048, 1, 3859, 512, 512, 512, 2048] + - [118, 16980.0] + - - [512, 2048, 1, 3870, 512, 512, 512, 2048] + - [118, 17006.0] + - - [512, 2048, 1, 3876, 512, 512, 512, 2048] + - [96, 17007.0] + - - [512, 2048, 1, 3906, 512, 512, 512, 2048] + - [118, 17014.0] + - - [512, 2048, 1, 3910, 512, 512, 512, 2048] + - [118, 17023.0] + - - [512, 2048, 1, 3925, 512, 512, 512, 2048] + - [118, 16990.0] + - - [512, 2048, 1, 3942, 512, 512, 512, 2048] + - [118, 17009.0] + - - [512, 2048, 1, 3944, 512, 512, 512, 2048] + - [118, 16985.0] + - - [512, 2048, 1, 3955, 512, 512, 512, 2048] + - [118, 17014.0] + - - [512, 2048, 1, 3968, 512, 512, 512, 2048] + - [96, 17082.0] + - - [512, 2048, 1, 3969, 512, 512, 512, 2048] + - [118, 17008.0] + - - [512, 2048, 1, 3976, 512, 512, 512, 2048] + - [118, 17009.0] + - - [512, 2048, 1, 3977, 512, 512, 512, 2048] + - [118, 17028.0] + - - [512, 2048, 1, 3978, 512, 512, 512, 2048] + - [118, 17017.0] + - - [512, 2048, 1, 3990, 512, 512, 512, 2048] + - [96, 17006.0] + - - [512, 2048, 1, 3995, 512, 512, 512, 2048] + - [118, 17016.0] + - - [512, 2048, 1, 3996, 512, 512, 512, 2048] + - [118, 16996.0] + - - [512, 2048, 1, 3999, 512, 512, 512, 2048] + - [118, 17010.0] + - - [512, 2048, 1, 4005, 512, 512, 512, 2048] + - [118, 17012.0] + - - [512, 2048, 1, 4012, 512, 512, 512, 2048] + - [96, 16996.0] + - - [512, 2048, 1, 4020, 512, 512, 512, 2048] + - [96, 16998.0] + - - [512, 2048, 1, 4026, 512, 512, 512, 2048] + - [96, 17002.0] + - - [512, 2048, 1, 4030, 512, 512, 512, 2048] + - [96, 17017.0] + - - [512, 2048, 1, 4032, 512, 512, 512, 2048] + - [96, 17062.0] + - - [2048, 512, 1, 2790, 2048, 2048, 2048, 512] + - [118, 16852.0] + - - [2048, 512, 1, 2864, 2048, 2048, 2048, 512] + - [118, 16908.0] + - - [2048, 512, 1, 3092, 2048, 2048, 2048, 512] + - [118, 16938.0] + - - [2048, 512, 1, 3113, 2048, 2048, 2048, 512] + - [96, 16915.0] + - - [2048, 512, 1, 3137, 2048, 2048, 2048, 512] + - [96, 16891.0] + - - [2048, 512, 1, 3165, 2048, 2048, 2048, 512] + - [118, 16922.0] + - - [2048, 512, 1, 3166, 2048, 2048, 2048, 512] + - [118, 16939.0] + - - [2048, 512, 1, 3194, 2048, 2048, 2048, 512] + - [96, 16932.0] + - - [2048, 512, 1, 3219, 2048, 2048, 2048, 512] + - [118, 16921.0] + - - [2048, 512, 1, 3222, 2048, 2048, 2048, 512] + - [118, 16915.0] + - - [2048, 512, 1, 3234, 2048, 2048, 2048, 512] + - [118, 16937.0] + - - [2048, 512, 1, 3237, 2048, 2048, 2048, 512] + - [118, 16934.0] + - - [2048, 512, 1, 3242, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3246, 2048, 2048, 2048, 512] + - [118, 16942.0] + - - [2048, 512, 1, 3249, 2048, 2048, 2048, 512] + - [118, 16939.0] + - - [2048, 512, 1, 3251, 2048, 2048, 2048, 512] + - [118, 16941.0] + - - [2048, 512, 1, 3257, 2048, 2048, 2048, 512] + - [118, 16927.0] + - - [2048, 512, 1, 3262, 2048, 2048, 2048, 512] + - [96, 16926.0] + - - [2048, 512, 1, 3268, 2048, 2048, 2048, 512] + - [118, 16931.0] + - - [2048, 512, 1, 3282, 2048, 2048, 2048, 512] + - [118, 16923.0] + - - [2048, 512, 1, 3286, 2048, 2048, 2048, 512] + - [118, 16947.0] + - - [2048, 512, 1, 3287, 2048, 2048, 2048, 512] + - [118, 16934.0] + - - [2048, 512, 1, 3293, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3297, 2048, 2048, 2048, 512] + - [96, 16940.0] + - - [2048, 512, 1, 3307, 2048, 2048, 2048, 512] + - [96, 16924.0] + - - [2048, 512, 1, 3314, 2048, 2048, 2048, 512] + - [96, 16935.0] + - - [2048, 512, 1, 3315, 2048, 2048, 2048, 512] + - [118, 16917.0] + - - [2048, 512, 1, 3319, 2048, 2048, 2048, 512] + - [96, 16932.0] + - - [2048, 512, 1, 3322, 2048, 2048, 2048, 512] + - [96, 16937.0] + - - [2048, 512, 1, 3323, 2048, 2048, 2048, 512] + - [118, 16949.0] + - - [2048, 512, 1, 3324, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3325, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3327, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3329, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3332, 2048, 2048, 2048, 512] + - [96, 16944.0] + - - [2048, 512, 1, 3336, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3339, 2048, 2048, 2048, 512] + - [118, 16947.0] + - - [2048, 512, 1, 3342, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3344, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3358, 2048, 2048, 2048, 512] + - [118, 16943.0] + - - [2048, 512, 1, 3360, 2048, 2048, 2048, 512] + - [96, 17017.0] + - - [2048, 512, 1, 3364, 2048, 2048, 2048, 512] + - [118, 16965.0] + - - [2048, 512, 1, 3365, 2048, 2048, 2048, 512] + - [118, 16967.0] + - - [2048, 512, 1, 3369, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3371, 2048, 2048, 2048, 512] + - [118, 16955.0] + - - [2048, 512, 1, 3374, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3376, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3377, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3378, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3381, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3382, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3383, 2048, 2048, 2048, 512] + - [96, 16958.0] + - - [2048, 512, 1, 3384, 2048, 2048, 2048, 512] + - [118, 16950.0] + - - [2048, 512, 1, 3385, 2048, 2048, 2048, 512] + - [118, 16946.0] + - - [2048, 512, 1, 3386, 2048, 2048, 2048, 512] + - [96, 16949.0] + - - [2048, 512, 1, 3388, 2048, 2048, 2048, 512] + - [118, 16974.0] + - - [2048, 512, 1, 3390, 2048, 2048, 2048, 512] + - [96, 16964.0] + - - [2048, 512, 1, 3391, 2048, 2048, 2048, 512] + - [118, 16951.0] + - - [2048, 512, 1, 3396, 2048, 2048, 2048, 512] + - [118, 16971.0] + - - [2048, 512, 1, 3399, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3402, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3410, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3412, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3414, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3415, 2048, 2048, 2048, 512] + - [118, 16982.0] + - - [2048, 512, 1, 3418, 2048, 2048, 2048, 512] + - [118, 16965.0] + - - [2048, 512, 1, 3420, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3422, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3425, 2048, 2048, 2048, 512] + - [96, 16960.0] + - - [2048, 512, 1, 3426, 2048, 2048, 2048, 512] + - [118, 16961.0] + - - [2048, 512, 1, 3428, 2048, 2048, 2048, 512] + - [96, 16955.0] + - - [2048, 512, 1, 3430, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3431, 2048, 2048, 2048, 512] + - [118, 16948.0] + - - [2048, 512, 1, 3432, 2048, 2048, 2048, 512] + - [118, 16956.0] + - - [2048, 512, 1, 3438, 2048, 2048, 2048, 512] + - [118, 16979.0] + - - [2048, 512, 1, 3439, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3440, 2048, 2048, 2048, 512] + - [96, 16947.0] + - - [2048, 512, 1, 3443, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3445, 2048, 2048, 2048, 512] + - [118, 16955.0] + - - [2048, 512, 1, 3447, 2048, 2048, 2048, 512] + - [118, 16963.0] + - - [2048, 512, 1, 3448, 2048, 2048, 2048, 512] + - [118, 16971.0] + - - [2048, 512, 1, 3450, 2048, 2048, 2048, 512] + - [118, 16960.0] + - - [2048, 512, 1, 3451, 2048, 2048, 2048, 512] + - [96, 16962.0] + - - [2048, 512, 1, 3453, 2048, 2048, 2048, 512] + - [118, 16962.0] + - - [2048, 512, 1, 3455, 2048, 2048, 2048, 512] + - [118, 16973.0] + - - [2048, 512, 1, 3456, 2048, 2048, 2048, 512] + - [96, 17026.0] + - - [2048, 512, 1, 3457, 2048, 2048, 2048, 512] + - [118, 16950.0] + - - [2048, 512, 1, 3458, 2048, 2048, 2048, 512] + - [96, 16933.0] + - - [2048, 512, 1, 3459, 2048, 2048, 2048, 512] + - [118, 16949.0] + - - [2048, 512, 1, 3460, 2048, 2048, 2048, 512] + - [96, 16952.0] + - - [2048, 512, 1, 3461, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3462, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3466, 2048, 2048, 2048, 512] + - [96, 16942.0] + - - [2048, 512, 1, 3467, 2048, 2048, 2048, 512] + - [118, 16958.0] + - - [2048, 512, 1, 3468, 2048, 2048, 2048, 512] + - [96, 16955.0] + - - [2048, 512, 1, 3470, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3471, 2048, 2048, 2048, 512] + - [118, 16966.0] + - - [2048, 512, 1, 3476, 2048, 2048, 2048, 512] + - [118, 16961.0] + - - [2048, 512, 1, 3477, 2048, 2048, 2048, 512] + - [118, 16964.0] + - - [2048, 512, 1, 3478, 2048, 2048, 2048, 512] + - [118, 16936.0] + - - [2048, 512, 1, 3479, 2048, 2048, 2048, 512] + - [118, 16964.0] + - - [2048, 512, 1, 3480, 2048, 2048, 2048, 512] + - [96, 16946.0] + - - [2048, 512, 1, 3481, 2048, 2048, 2048, 512] + - [96, 16950.0] + - - [2048, 512, 1, 3483, 2048, 2048, 2048, 512] + - [96, 16962.0] + - - [2048, 512, 1, 3484, 2048, 2048, 2048, 512] + - [96, 16957.0] + - - [2048, 512, 1, 3487, 2048, 2048, 2048, 512] + - [118, 16973.0] + - - [2048, 512, 1, 3489, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3490, 2048, 2048, 2048, 512] + - [118, 16967.0] + - - [2048, 512, 1, 3491, 2048, 2048, 2048, 512] + - [118, 16958.0] + - - [2048, 512, 1, 3493, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3494, 2048, 2048, 2048, 512] + - [118, 16940.0] + - - [2048, 512, 1, 3495, 2048, 2048, 2048, 512] + - [118, 16954.0] + - - [2048, 512, 1, 3497, 2048, 2048, 2048, 512] + - [96, 16984.0] + - - [2048, 512, 1, 3498, 2048, 2048, 2048, 512] + - [96, 16956.0] + - - [2048, 512, 1, 3499, 2048, 2048, 2048, 512] + - [96, 16975.0] + - - [2048, 512, 1, 3501, 2048, 2048, 2048, 512] + - [96, 16967.0] + - - [2048, 512, 1, 3503, 2048, 2048, 2048, 512] + - [118, 16960.0] + - - [2048, 512, 1, 3507, 2048, 2048, 2048, 512] + - [96, 16956.0] + - - [2048, 512, 1, 3508, 2048, 2048, 2048, 512] + - [96, 16972.0] + - - [2048, 512, 1, 3509, 2048, 2048, 2048, 512] + - [96, 16965.0] + - - [2048, 512, 1, 3511, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3514, 2048, 2048, 2048, 512] + - [96, 16966.0] + - - [2048, 512, 1, 3515, 2048, 2048, 2048, 512] + - [96, 16969.0] + - - [2048, 512, 1, 3517, 2048, 2048, 2048, 512] + - [118, 16977.0] + - - [2048, 512, 1, 3518, 2048, 2048, 2048, 512] + - [118, 16976.0] + - - [2048, 512, 1, 3519, 2048, 2048, 2048, 512] + - [118, 16970.0] + - - [2048, 512, 1, 3520, 2048, 2048, 2048, 512] + - [96, 17044.0] + - - [2048, 512, 1, 3523, 2048, 2048, 2048, 512] + - [96, 16972.0] + - - [2048, 512, 1, 3528, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3529, 2048, 2048, 2048, 512] + - [118, 16978.0] + - - [2048, 512, 1, 3530, 2048, 2048, 2048, 512] + - [96, 16979.0] + - - [2048, 512, 1, 3532, 2048, 2048, 2048, 512] + - [96, 16969.0] + - - [2048, 512, 1, 3533, 2048, 2048, 2048, 512] + - [118, 16959.0] + - - [2048, 512, 1, 3534, 2048, 2048, 2048, 512] + - [118, 16975.0] + - - [2048, 512, 1, 3538, 2048, 2048, 2048, 512] + - [118, 16983.0] + - - [2048, 512, 1, 3539, 2048, 2048, 2048, 512] + - [118, 16982.0] + - - [2048, 512, 1, 3541, 2048, 2048, 2048, 512] + - [118, 16999.0] + - - [2048, 512, 1, 3547, 2048, 2048, 2048, 512] + - [118, 16997.0] + - - [2048, 512, 1, 3548, 2048, 2048, 2048, 512] + - [118, 16957.0] + - - [2048, 512, 1, 3552, 2048, 2048, 2048, 512] + - [96, 17040.0] + - - [2048, 512, 1, 3564, 2048, 2048, 2048, 512] + - [118, 16999.0] + - - [2048, 512, 1, 3575, 2048, 2048, 2048, 512] + - [118, 16976.0] + - - [2048, 512, 1, 3598, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3599, 2048, 2048, 2048, 512] + - [118, 16972.0] + - - [2048, 512, 1, 3608, 2048, 2048, 2048, 512] + - [118, 16991.0] + - - [2048, 512, 1, 3780, 2048, 2048, 2048, 512] + - [96, 16997.0] + - - [2048, 512, 1, 3796, 2048, 2048, 2048, 512] + - [96, 17003.0] + - - [2048, 512, 1, 3822, 2048, 2048, 2048, 512] + - [118, 16983.0] + - - [2048, 512, 1, 3840, 2048, 2048, 2048, 512] + - [96, 17053.0] + - - [2048, 512, 1, 3859, 2048, 2048, 2048, 512] + - [96, 17019.0] + - - [2048, 512, 1, 3870, 2048, 2048, 2048, 512] + - [96, 17003.0] + - - [2048, 512, 1, 3876, 2048, 2048, 2048, 512] + - [118, 16994.0] + - - [2048, 512, 1, 3906, 2048, 2048, 2048, 512] + - [96, 16998.0] + - - [2048, 512, 1, 3910, 2048, 2048, 2048, 512] + - [118, 17025.0] + - - [2048, 512, 1, 3925, 2048, 2048, 2048, 512] + - [118, 17014.0] + - - [2048, 512, 1, 3942, 2048, 2048, 2048, 512] + - [118, 16998.0] + - - [2048, 512, 1, 3944, 2048, 2048, 2048, 512] + - [118, 17015.0] + - - [2048, 512, 1, 3955, 2048, 2048, 2048, 512] + - [118, 17010.0] + - - [2048, 512, 1, 3968, 2048, 2048, 2048, 512] + - [96, 17093.0] + - - [2048, 512, 1, 3969, 2048, 2048, 2048, 512] + - [118, 17031.0] + - - [2048, 512, 1, 3976, 2048, 2048, 2048, 512] + - [118, 17006.0] + - - [2048, 512, 1, 3977, 2048, 2048, 2048, 512] + - [118, 17010.0] + - - [2048, 512, 1, 3978, 2048, 2048, 2048, 512] + - [118, 17023.0] + - - [2048, 512, 1, 3990, 2048, 2048, 2048, 512] + - [118, 17006.0] + - - [2048, 512, 1, 3995, 2048, 2048, 2048, 512] + - [118, 17019.0] + - - [2048, 512, 1, 3996, 2048, 2048, 2048, 512] + - [118, 17012.0] + - - [2048, 512, 1, 3999, 2048, 2048, 2048, 512] + - [118, 17017.0] + - - [2048, 512, 1, 4005, 2048, 2048, 2048, 512] + - [118, 16997.0] + - - [2048, 512, 1, 4012, 2048, 2048, 2048, 512] + - [118, 17026.0] + - - [2048, 512, 1, 4020, 2048, 2048, 2048, 512] + - [118, 17022.0] + - - [2048, 512, 1, 4026, 2048, 2048, 2048, 512] + - [118, 17016.0] + - - [2048, 512, 1, 4030, 2048, 2048, 2048, 512] + - [96, 17016.0] + - - [2048, 512, 1, 4032, 2048, 2048, 2048, 512] + - [96, 17093.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [109, 10980.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [61, 12770.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [109, 17200.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [127, 16972.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [96, 17076.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [96, 17168.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [96, 17193.0] + - - [768, 768, 1, 384, 768, 768, 768, 768] + - [63, 14050.0] + - - [768, 384, 1, 384, 768, 768, 768, 384] + - [120, 10447.0] + - - [1152, 576, 1, 384, 1152, 1152, 1152, 576] + - [110, 12157.0] + - - [384, 768, 1, 384, 384, 384, 384, 768] + - [62, 10584.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [61, 6792.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [61, 16756.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [86, 16328.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [96, 16677.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [63, 13959.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [65, 10012.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [88, 14765.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [61, 15021.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [61, 10699.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [93, 16363.0] + - - [2304, 128, 1, 128, 2304, 2304, 2304, 128] + - [62, 7204.0] + - - [2688, 128, 1, 128, 2688, 2688, 2688, 128] + - [62, 7414.0] + - - [3072, 128, 1, 128, 3072, 3072, 3072, 128] + - [60, 7914.0] + - - [3456, 128, 1, 128, 3456, 3456, 3456, 128] + - [73, 8043.0] + - - [3840, 128, 1, 128, 3840, 3840, 3840, 128] + - [63, 8738.0] + - - [4224, 128, 1, 128, 4224, 4224, 4224, 128] + - [63, 9585.0] + - - [4608, 128, 1, 128, 4608, 4608, 4608, 128] + - [61, 10371.0] + - - [4992, 128, 1, 128, 4992, 4992, 4992, 128] + - [62, 9577.0] + - - [5376, 128, 1, 128, 5376, 5376, 5376, 128] + - [63, 10194.0] + - - [5760, 128, 1, 128, 5760, 5760, 5760, 128] + - [63, 10822.0] + - - [6144, 128, 1, 128, 6144, 6144, 6144, 128] + - [61, 11362.0] + - - [6528, 128, 1, 128, 6528, 6528, 6528, 128] + - [63, 12099.0] + - - [6912, 128, 1, 128, 6912, 6912, 6912, 128] + - [68, 10995.0] + - - [7296, 128, 1, 128, 7296, 7296, 7296, 128] + - [63, 11628.0] + - - [7680, 128, 1, 128, 7680, 7680, 7680, 128] + - [61, 11984.0] + - - [8064, 128, 1, 128, 8064, 8064, 8064, 128] + - [87, 12607.0] + - - [8448, 128, 1, 128, 8448, 8448, 8448, 128] + - [63, 13009.0] + - - [8832, 128, 1, 128, 8832, 8832, 8832, 128] + - [63, 13600.0] + - - [2304, 128, 1, 256, 2304, 2304, 2304, 128] + - [62, 9207.0] + - - [2688, 128, 1, 256, 2688, 2688, 2688, 128] + - [60, 8635.0] + - - [3072, 128, 1, 256, 3072, 3072, 3072, 128] + - [62, 9478.0] + - - [3456, 128, 1, 256, 3456, 3456, 3456, 128] + - [63, 9882.0] + - - [3840, 128, 1, 256, 3840, 3840, 3840, 128] + - [63, 10942.0] + - - [4224, 128, 1, 256, 4224, 4224, 4224, 128] + - [61, 11912.0] + - - [4608, 128, 1, 256, 4608, 4608, 4608, 128] + - [63, 12862.0] + - - [4992, 128, 1, 256, 4992, 4992, 4992, 128] + - [87, 11113.0] + - - [5376, 128, 1, 256, 5376, 5376, 5376, 128] + - [63, 11791.0] + - - [5760, 128, 1, 256, 5760, 5760, 5760, 128] + - [87, 12600.0] + - - [6144, 128, 1, 256, 6144, 6144, 6144, 128] + - [63, 13263.0] + - - [6528, 128, 1, 256, 6528, 6528, 6528, 128] + - [63, 14185.0] + - - [6912, 128, 1, 256, 6912, 6912, 6912, 128] + - [87, 12404.0] + - - [7296, 128, 1, 256, 7296, 7296, 7296, 128] + - [78, 12993.0] + - - [7680, 128, 1, 256, 7680, 7680, 7680, 128] + - [63, 13545.0] + - - [8064, 128, 1, 256, 8064, 8064, 8064, 128] + - [78, 14314.0] + - - [8448, 128, 1, 256, 8448, 8448, 8448, 128] + - [63, 14819.0] + - - [8832, 128, 1, 256, 8832, 8832, 8832, 128] + - [88, 15610.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [63, 15397.0] + - - [384, 1536, 1, 384, 384, 384, 384, 1536] + - [63, 13981.0] + - - [384, 1920, 1, 384, 384, 384, 384, 1920] + - [63, 13495.0] + - - [384, 2304, 1, 384, 384, 384, 384, 2304] + - [88, 13057.0] + - - [64, 192, 64, 1280, 64, 64, 64, 192] + - [109, 14917.0] + - - [64, 320, 64, 1280, 64, 64, 64, 320] + - [101, 15333.0] + - - [64, 384, 64, 1280, 64, 64, 64, 384] + - [109, 15585.0] + - - [64, 448, 64, 1280, 64, 64, 64, 448] + - [119, 14815.0] + - - [64, 192, 64, 2048, 64, 64, 64, 192] + - [113, 14787.0] + - - [64, 320, 64, 2048, 64, 64, 64, 320] + - [76, 12161.0] + - - [64, 384, 64, 2048, 64, 64, 64, 384] + - [76, 12047.0] + - - [64, 448, 64, 2048, 64, 64, 64, 448] + - [76, 12114.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 64] + - [110, 17339.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 64] + - [63, 17910.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 64] + - [110, 18027.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 80] + - [59, 8145.0] + - - [64, 192, 32, 1280, 64, 64, 64, 192] + - [62, 10985.0] + - - [64, 320, 32, 1280, 64, 64, 64, 320] + - [109, 12675.0] + - - [64, 384, 32, 1280, 64, 64, 64, 384] + - [86, 14975.0] + - - [64, 448, 32, 1280, 64, 64, 64, 448] + - [66, 13771.0] + - - [64, 192, 32, 2048, 64, 64, 64, 192] + - [94, 11567.0] + - - [64, 320, 32, 2048, 64, 64, 64, 320] + - [109, 12839.0] + - - [64, 384, 32, 2048, 64, 64, 64, 384] + - [86, 15163.0] + - - [64, 448, 32, 2048, 64, 64, 64, 448] + - [113, 13692.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 64] + - [110, 15887.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 64] + - [110, 16919.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 64] + - [110, 17049.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 80] + - [63, 11046.0] + - - [289, 128, 32, 768, 289, 289, 289, 128] + - [109, 14161.0] + - - [289, 160, 32, 768, 289, 289, 289, 160] + - [86, 13023.0] + - - [289, 192, 32, 768, 289, 289, 289, 192] + - [109, 15563.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [86, 16170.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [98, 18441.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [116, 13149.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [74, 17165.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [88, 15001.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [93, 13915.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [93, 15208.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [116, 14171.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [61, 13275.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [61, 9216.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [61, 8732.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [61, 10625.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [63, 17946.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [86, 11278.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [86, 13943.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [64, 12394.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [64, 15110.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [111, 15092.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [61, 10513.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [61, 13190.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [66, 2834.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [61, 16829.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [61, 7002.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [88, 16157.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [96, 17142.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [88, 14924.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [76, 16998.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [61, 8004.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [88, 16251.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [74, 17133.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [109, 16934.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [61, 7362.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [88, 16156.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [96, 17151.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [86, 17336.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [107, 1628.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [88, 15327.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [96, 16919.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [100, 15418.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [61, 12224.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [107, 2913.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [61, 13508.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [61, 13222.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [108, 2636.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [63, 13657.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [93, 16480.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [106, 17163.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [96, 17190.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [106, 17153.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [107, 326.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [92, 4934.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [123, 10721.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [83, 10725.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [86, 13839.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [119, 14045.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [110, 15336.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [86, 15420.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [87, 14402.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [63, 14397.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [71, 14907.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [71, 14945.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [105, 12482.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [68, 12369.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [123, 16423.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [61, 16217.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [99, 12387.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [86, 12248.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [69, 12296.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [68, 12218.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [108, 8582.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [61, 17562.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [110, 18028.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [127, 16813.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [88, 16294.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [88, 14935.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [63, 13870.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [107, 301.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [107, 362.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [63, 13741.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [63, 12677.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [59, 9582.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [76, 8096.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [88, 8994.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [119, 7798.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [79, 7460.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [107, 2926.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [127, 17008.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [102, 10793.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [81, 10752.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [122, 11073.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [61, 17581.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [110, 8825.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [86, 10906.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [59, 6781.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [95, 6451.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [121, 6671.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [124, 6537.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [61, 10251.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [68, 6065.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [68, 6022.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [68, 6083.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [68, 6053.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [61, 12840.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [61, 14000.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [63, 16332.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [109, 17971.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [110, 18595.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [88, 16740.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [86, 15595.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [87, 16216.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [61, 14847.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [110, 14749.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [86, 16636.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [99, 16556.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [127, 16898.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [63, 15747.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [87, 14554.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [63, 13746.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [87, 13498.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [87, 14828.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [88, 16372.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [63, 15259.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [71, 14096.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [125, 14394.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [61, 15804.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [93, 13833.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [71, 13673.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [67, 531.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [95, 1779.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [70, 499.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [59, 1826.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [107, 1843.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [75, 592.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [82, 1933.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [59, 2042.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [107, 557.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [107, 517.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [60, 9560.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [116, 15091.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [93, 15056.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [125, 14644.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [71, 14444.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [116, 14925.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [71, 14841.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [63, 14737.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [87, 14728.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [87, 13945.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [87, 13928.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [116, 14488.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [71, 14336.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [71, 12850.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [63, 12616.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [87, 13634.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [63, 13764.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [72, 10741.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [115, 10797.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [110, 18138.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [126, 12573.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [61, 12475.0] + - - [64, 64, 64, 13216, 64, 64, 64, 64] + - [80, 6971.0] + - - [64, 96, 36, 10368, 64, 64, 64, 96] + - [80, 8398.0] + - - [64, 64, 36, 12544, 64, 64, 64, 64] + - [132, 7460.0] + - - [64, 64, 36, 11552, 64, 64, 64, 64] + - [130, 7811.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [128, 15060.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [128, 15329.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [128, 15287.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [128, 15552.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [128, 15605.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [128, 15678.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [128, 15686.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [128, 15654.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [128, 15732.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [128, 15785.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [128, 15799.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [128, 15846.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [129, 15857.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [128, 15875.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [128, 14872.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [128, 14974.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [128, 15138.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [128, 15192.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [128, 14272.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [131, 8465.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [161, 6139.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [136, 3663.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [147, 6759.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 64] + - [135, 3068.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [165, 4917.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [136, 6200.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [165, 6481.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [192, 4946.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [169, 9514.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [136, 7303.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [147, 8018.0] + - - [704, 256, 1, 128, 704, 704, 704, 256] + - [184, 4846.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [184, 4887.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [169, 9133.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [136, 6918.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [136, 6638.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [165, 7712.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [165, 6475.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [186, 7005.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [165, 7097.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [192, 6758.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [152, 3866.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [165, 4946.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [169, 8308.0] + - - [64, 2944, 1, 128, 64, 64, 64, 2944] + - [136, 5067.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [169, 8798.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [134, 7634.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [169, 7977.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [201, 4449.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [184, 5430.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [192, 5330.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [182, 5976.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 64] + - [186, 6184.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [165, 6746.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [134, 6554.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [151, 5105.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [136, 4806.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 256] + - [136, 5887.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 64] + - [136, 5004.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [173, 4485.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [168, 3721.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [165, 6268.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [196, 7973.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [138, 6194.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 64] + - [136, 5187.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [147, 8743.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [169, 7698.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [196, 8291.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [156, 8270.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [207, 9532.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [136, 4459.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [136, 5930.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 64] + - [136, 4127.0] + - - [64, 1408, 1, 128, 64, 64, 64, 1408] + - [135, 3084.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [134, 7999.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [136, 6216.0] + - - [448, 256, 1, 128, 448, 448, 448, 256] + - [152, 3529.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [192, 5325.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [135, 3051.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [165, 6247.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [182, 7705.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [147, 6883.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 64] + - [152, 3503.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [135, 3068.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [182, 8027.0] + - - [256, 448, 1, 128, 256, 256, 256, 448] + - [168, 3291.0] + - - [64, 3584, 1, 128, 64, 64, 64, 3584] + - [186, 5224.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [202, 4863.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [169, 7800.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [134, 7984.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [135, 3757.0] + - - [64, 1856, 1, 128, 64, 64, 64, 1856] + - [151, 3440.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [151, 5187.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [192, 7688.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [138, 5489.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [196, 6484.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [165, 7133.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [169, 9167.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [165, 5310.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [147, 7790.0] + - - [256, 704, 1, 128, 256, 256, 256, 704] + - [160, 4727.0] + - - [256, 1024, 1, 128, 256, 256, 256, 1024] + - [161, 5971.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [192, 4938.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [147, 8319.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [142, 4964.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [169, 8075.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [134, 5946.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [147, 9120.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [165, 6987.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [173, 4660.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [141, 3757.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [147, 7739.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [136, 6079.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [165, 6987.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [169, 7932.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [165, 5323.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [186, 6388.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [186, 7349.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [182, 7690.0] + - - [64, 2368, 1, 128, 64, 64, 64, 2368] + - [173, 4254.0] + - - [64, 4288, 1, 128, 64, 64, 64, 4288] + - [186, 6228.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [138, 6540.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [189, 3721.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [169, 8279.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [182, 7690.0] + - - [448, 448, 1, 128, 448, 448, 448, 448] + - [184, 5375.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [182, 6054.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [149, 8272.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [190, 7874.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [142, 7889.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [149, 8387.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [160, 5802.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [144, 8951.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [190, 6730.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [177, 8294.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [165, 7470.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [177, 7055.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [184, 5361.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [144, 6388.0] + - - [64, 64, 36, 3136, 64, 64, 64, 64] + - [192, 8500.0] + - - [64, 64, 64, 826, 64, 64, 64, 64] + - [147, 8290.0] + - - [64, 64, 64, 1600, 64, 64, 64, 64] + - [147, 8804.0] + - - [64, 96, 64, 288, 64, 64, 64, 96] + - [190, 7832.0] + - - [96, 96, 36, 1568, 96, 96, 96, 96] + - [162, 8242.0] + - - [96, 96, 36, 2592, 96, 96, 96, 96] + - [188, 8323.0] + - - [64, 96, 64, 800, 64, 64, 64, 96] + - [196, 8456.0] + - - [35, 96, 36, 8960, 35, 35, 35, 96] + - [194, 4414.0] + - - [32, 64, 36, 43808, 32, 32, 32, 64] + - [149, 4524.0] + - - [64, 64, 64, 81, 64, 64, 64, 64] + - [138, 3962.0] + - - [64, 96, 36, 512, 64, 64, 64, 96] + - [140, 6456.0] + - - [64, 64, 64, 3200, 64, 64, 64, 64] + - [147, 9110.0] + - - [64, 64, 36, 3520, 64, 64, 64, 64] + - [192, 8642.0] + - - [64, 64, 64, 5408, 64, 64, 64, 64] + - [208, 8477.0] + - - [35, 96, 36, 13440, 35, 35, 35, 96] + - [194, 4317.0] + - - [96, 96, 64, 1152, 96, 96, 96, 96] + - [140, 9033.0] + - - [32, 64, 36, 90, 32, 32, 32, 64] + - [185, 2100.0] + - - [64, 64, 64, 800, 64, 64, 64, 64] + - [169, 8283.0] + - - [64, 64, 36, 1568, 64, 64, 64, 64] + - [165, 8147.0] + - - [64, 64, 36, 196, 64, 64, 64, 64] + - [160, 4722.0] + - - [35, 96, 64, 4235, 35, 35, 35, 96] + - [207, 4842.0] + - - [149, 32, 36, 19072, 149, 149, 149, 32] + - [175, 5441.0] + - - [64, 96, 36, 1568, 64, 64, 64, 96] + - [207, 7160.0] + - - [96, 96, 64, 800, 96, 96, 96, 96] + - [140, 9099.0] + - - [32, 64, 64, 640, 32, 32, 32, 64] + - [140, 6096.0] + - - [64, 64, 36, 392, 64, 64, 64, 64] + - [144, 6046.0] + - - [64, 64, 64, 1652, 64, 64, 64, 64] + - [147, 8820.0] + - - [64, 96, 36, 2592, 64, 64, 64, 96] + - [167, 7897.0] + - - [64, 64, 36, 6272, 64, 64, 64, 64] + - [149, 8850.0] + - - [32, 64, 64, 20000, 32, 32, 32, 64] + - [175, 4464.0] + - - [64, 64, 64, 648, 64, 64, 64, 64] + - [138, 7490.0] + - - [32, 64, 36, 1440, 32, 32, 32, 64] + - [143, 4685.0] + - - [64, 64, 64, 100, 64, 64, 64, 64] + - [138, 5328.0] + - - [64, 96, 64, 4608, 64, 64, 64, 96] + - [154, 8461.0] + - - [64, 64, 64, 200, 64, 64, 64, 64] + - [138, 5958.0] + - - [32, 64, 64, 40, 32, 32, 32, 64] + - [137, 2149.0] + - - [64, 96, 64, 1152, 64, 64, 64, 96] + - [169, 8665.0] + - - [149, 32, 64, 8195, 149, 149, 149, 32] + - [134, 5538.0] + - - [35, 96, 64, 6160, 35, 35, 35, 96] + - [147, 4704.0] + - - [64, 64, 36, 1760, 64, 64, 64, 64] + - [165, 7789.0] + - - [64, 2880, 1, 320, 64, 64, 64, 2880] + - [136, 6397.0] + - - [49, 832, 32, 256, 49, 49, 49, 832] + - [142, 7752.0] + - - [289, 1120, 1, 160, 289, 289, 289, 1120] + - [142, 6378.0] + - - [64, 1728, 1, 320, 64, 64, 64, 1728] + - [142, 4560.0] + - - [49, 832, 32, 160, 49, 49, 49, 832] + - [176, 7579.0] + - - [49, 832, 32, 384, 49, 49, 49, 832] + - [142, 7917.0] + - - [289, 896, 1, 192, 289, 289, 289, 896] + - [138, 6309.0] + - - [289, 896, 1, 128, 289, 289, 289, 896] + - [136, 5543.0] + - - [196, 800, 1, 64, 196, 196, 196, 800] + - [148, 2803.0] + - - [64, 1344, 1, 512, 64, 64, 64, 1344] + - [146, 4124.0] + - - [64, 1152, 1, 384, 64, 64, 64, 1152] + - [143, 3965.0] + - - [64, 1152, 1, 448, 64, 64, 64, 1152] + - [143, 4078.0] + - - [49, 832, 32, 128, 49, 49, 49, 832] + - [202, 7428.0] + - - [49, 832, 32, 48, 49, 49, 49, 832] + - [184, 6275.0] + - - [64, 1152, 1, 256, 64, 64, 64, 1152] + - [164, 3616.0] + - - [49, 832, 32, 32, 49, 49, 49, 832] + - [136, 5479.0] + - - [289, 1120, 1, 192, 289, 289, 289, 1120] + - [163, 6668.0] + - - [196, 600, 1, 64, 196, 196, 196, 600] + - [133, 2382.0] + - - [49, 832, 32, 192, 49, 49, 49, 832] + - [163, 7623.0] + - - [64, 1728, 1, 192, 64, 64, 64, 1728] + - [168, 3918.0] + - - [64, 38, 840, 38, 64, 64, 64, 38] + - [184, 5452.0] + - - [64, 49, 648, 49, 64, 64, 64, 49] + - [160, 6963.0] + - - [64, 32, 992, 32, 64, 64, 64, 32] + - [156, 6042.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [136, 4998.0] + - - [64, 41, 776, 41, 64, 64, 64, 41] + - [165, 5814.0] + - - [64, 45, 712, 45, 64, 64, 64, 45] + - [144, 6489.0] + - - [64, 54, 592, 54, 64, 64, 64, 54] + - [160, 7672.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [184, 8312.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [198, 7992.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [190, 7868.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [160, 4000.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [153, 1884.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [182, 4997.0] + - - [512, 512, 1, 3780, 512, 512, 512, 512] + - [147, 9134.0] + - - [512, 512, 1, 3796, 512, 512, 512, 512] + - [169, 9143.0] + - - [512, 512, 1, 3822, 512, 512, 512, 512] + - [196, 9145.0] + - - [512, 512, 1, 3840, 512, 512, 512, 512] + - [169, 9175.0] + - - [512, 512, 1, 3859, 512, 512, 512, 512] + - [196, 9148.0] + - - [512, 512, 1, 3870, 512, 512, 512, 512] + - [169, 9146.0] + - - [512, 512, 1, 3876, 512, 512, 512, 512] + - [196, 9147.0] + - - [512, 512, 1, 3906, 512, 512, 512, 512] + - [169, 9146.0] + - - [512, 512, 1, 3910, 512, 512, 512, 512] + - [147, 9148.0] + - - [512, 512, 1, 3925, 512, 512, 512, 512] + - [169, 9138.0] + - - [512, 512, 1, 3927, 512, 512, 512, 512] + - [147, 9134.0] + - - [512, 512, 1, 3942, 512, 512, 512, 512] + - [207, 9147.0] + - - [512, 512, 1, 3944, 512, 512, 512, 512] + - [147, 9158.0] + - - [512, 512, 1, 3955, 512, 512, 512, 512] + - [169, 9154.0] + - - [512, 512, 1, 3968, 512, 512, 512, 512] + - [169, 9155.0] + - - [512, 512, 1, 3969, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 3976, 512, 512, 512, 512] + - [169, 9156.0] + - - [512, 512, 1, 3977, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 3978, 512, 512, 512, 512] + - [196, 9154.0] + - - [512, 512, 1, 3990, 512, 512, 512, 512] + - [169, 9157.0] + - - [512, 512, 1, 3995, 512, 512, 512, 512] + - [180, 9166.0] + - - [512, 512, 1, 3996, 512, 512, 512, 512] + - [147, 9168.0] + - - [512, 512, 1, 3999, 512, 512, 512, 512] + - [147, 9183.0] + - - [512, 512, 1, 4005, 512, 512, 512, 512] + - [196, 9185.0] + - - [512, 512, 1, 4012, 512, 512, 512, 512] + - [147, 9153.0] + - - [512, 512, 1, 4020, 512, 512, 512, 512] + - [147, 9146.0] + - - [512, 512, 1, 4026, 512, 512, 512, 512] + - [196, 9157.0] + - - [512, 512, 1, 4030, 512, 512, 512, 512] + - [147, 9156.0] + - - [512, 512, 1, 4032, 512, 512, 512, 512] + - [169, 9196.0] + - - [512, 512, 1, 4050, 512, 512, 512, 512] + - [156, 9175.0] + - - [512, 512, 1, 4059, 512, 512, 512, 512] + - [147, 9187.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [201, 6348.0] + - - [384, 192, 1, 384, 384, 384, 384, 192] + - [143, 3965.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [147, 8637.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [169, 8788.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [156, 9044.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [196, 9087.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [147, 9124.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [156, 9156.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [169, 9171.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [147, 9217.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [147, 9206.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [169, 9206.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [147, 9221.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [169, 9215.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [156, 9225.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [169, 9240.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [196, 9246.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [169, 9265.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [147, 9272.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [196, 8920.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [169, 6227.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [182, 5503.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [167, 5924.0] + - - [768, 128, 1, 128, 768, 768, 768, 128] + - [143, 3146.0] + - - [1152, 128, 1, 128, 1152, 1152, 1152, 128] + - [152, 4369.0] + - - [1536, 128, 1, 128, 1536, 1536, 1536, 128] + - [173, 5115.0] + - - [1920, 128, 1, 128, 1920, 1920, 1920, 128] + - [136, 5617.0] + - - [768, 128, 1, 256, 768, 768, 768, 128] + - [152, 3884.0] + - - [1152, 128, 1, 256, 1152, 1152, 1152, 128] + - [201, 5825.0] + - - [1536, 128, 1, 256, 1536, 1536, 1536, 128] + - [136, 6420.0] + - - [1920, 128, 1, 256, 1920, 1920, 1920, 128] + - [138, 6665.0] + - - [448, 448, 1, 448, 448, 448, 448, 448] + - [160, 7322.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 32] + - [142, 9875.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 48] + - [171, 7706.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 48] + - [149, 7780.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 48] + - [149, 7777.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 32] + - [163, 9299.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 48] + - [149, 7517.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 48] + - [149, 7617.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 48] + - [149, 7617.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [198, 8207.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [163, 7562.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [165, 7577.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [198, 7918.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [147, 6915.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [147, 7419.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [147, 7499.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [147, 7980.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [206, 7460.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [158, 7982.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [144, 5436.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [144, 5401.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [165, 5561.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [156, 7618.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [184, 8253.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [160, 5499.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [160, 5454.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [173, 5631.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [159, 127.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [204, 7662.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [163, 6749.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [149, 7831.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [165, 8006.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [171, 7832.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [190, 7885.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [138, 6242.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [190, 9157.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [135, 730.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [141, 211.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [141, 638.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [141, 782.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [141, 178.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [141, 175.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [141, 207.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [141, 722.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [141, 621.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [141, 229.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [141, 210.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [141, 702.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [141, 192.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [141, 729.0] + - - [32, 32, 36, 43808, 32, 32, 32, 32] + - [179, 3472.0] + - - [32, 32, 64, 20000, 32, 32, 32, 32] + - [191, 3461.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [221, 7314.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 4] + - [236, 1199.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 4] + - [189, 636.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 4] + - [230, 803.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 4] + - [232, 1892.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 4] + - [230, 661.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 4] + - [189, 457.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 4] + - [227, 530.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 4] + - [228, 861.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 4] + - [230, 1327.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 4] + - [233, 810.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 4] + - [189, 518.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 4] + - [189, 319.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 4] + - [229, 1510.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 4] + - [230, 576.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 4] + - [227, 914.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 4] + - [237, 1862.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 4] + - [233, 711.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 4] + - [231, 1727.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 4] + - [230, 1091.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 4] + - [189, 365.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 4] + - [189, 290.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 4] + - [230, 503.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 4] + - [233, 1249.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 4] + - [233, 617.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 4] + - [230, 758.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 4] + - [233, 740.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 4] + - [230, 880.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 4] + - [235, 2092.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 4] + - [236, 1424.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 4] + - [189, 419.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 4] + - [189, 221.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 4] + - [238, 977.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 4] + - [230, 1580.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 4] + - [230, 996.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 4] + - [234, 1108.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 4] + - [235, 1671.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 1] + - [141, 145.0] + - - [2048, 1, 1, 960, 2048, 2048, 2048, 1] + - [146, 172.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [226, 5.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [226, 13.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [141, 21.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [226, 7.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 1856] + - [241, 745.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 2944] + - [249, 1006.0] + - - [4, 1408, 1, 128, 4, 4, 4, 1408] + - [189, 218.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 2368] + - [241, 814.0] + - - [4, 3584, 1, 128, 4, 4, 4, 3584] + - [187, 515.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 5888] + - [243, 1572.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 1408] + - [241, 566.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 6784] + - [249, 1579.0] + - - [4, 4288, 1, 128, 4, 4, 4, 4288] + - [187, 606.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 5056] + - [249, 1455.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 6784] + - [242, 1493.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 2944] + - [241, 1111.0] + - - [4, 5056, 1, 256, 4, 4, 4, 5056] + - [245, 973.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 5056] + - [244, 1355.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 2368] + - [241, 898.0] + - - [4, 1856, 1, 256, 4, 4, 4, 1856] + - [248, 411.0] + - - [4, 2368, 1, 256, 4, 4, 4, 2368] + - [174, 505.0] + - - [4, 2944, 1, 256, 4, 4, 4, 2944] + - [174, 620.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 4288] + - [249, 1402.0] + - - [4, 6784, 1, 128, 4, 4, 4, 6784] + - [247, 895.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 3584] + - [241, 1190.0] + - - [4, 5888, 1, 256, 4, 4, 4, 5888] + - [245, 1096.0] + - - [4, 6784, 1, 256, 4, 4, 4, 6784] + - [246, 1106.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1408] + - [250, 495.0] + - - [4, 3584, 1, 256, 4, 4, 4, 3584] + - [239, 711.0] + - - [4, 1408, 1, 256, 4, 4, 4, 1408] + - [240, 316.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 4288] + - [244, 1562.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 5888] + - [243, 1485.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1856] + - [249, 651.0] + - - [4, 1856, 1, 128, 4, 4, 4, 1856] + - [150, 285.0] + - - [4, 2944, 1, 128, 4, 4, 4, 2944] + - [240, 433.0] + - - [4, 5056, 1, 128, 4, 4, 4, 5056] + - [247, 700.0] + - - [4, 4288, 1, 256, 4, 4, 4, 4288] + - [245, 838.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3584] + - [249, 1314.0] + - - [4, 5888, 1, 128, 4, 4, 4, 5888] + - [247, 806.0] + - - [4, 2368, 1, 128, 4, 4, 4, 2368] + - [240, 352.0] + - - [49, 1200, 1, 128, 49, 49, 49, 1200] + - [135, 2034.0] + - - [1, 1152, 1, 256, 1, 1, 1, 1152] + - [143, 68.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [143, 1676.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [143, 603.0] + - - [16, 32, 36, 5760, 16, 16, 16, 32] + - [210, 3267.0] + - - [3, 64, 36, 6272, 3, 3, 3, 64] + - [222, 746.0] + - - [3, 64, 64, 46208, 3, 3, 3, 64] + - [193, 578.0] + - - [3, 64, 64, 92416, 3, 3, 3, 64] + - [178, 576.0] + - - [1, 16, 36, 23040, 1, 1, 1, 16] + - [214, 201.0] + - - [1, 16, 64, 10240, 1, 1, 1, 16] + - [210, 233.0] + - - [3, 64, 36, 25088, 3, 3, 3, 64] + - [220, 588.0] + - - [3, 64, 64, 11552, 3, 3, 3, 64] + - [219, 654.0] + - - [3, 64, 36, 200704, 3, 3, 3, 64] + - [209, 571.0] + - - [3, 64, 64, 23104, 3, 3, 3, 64] + - [205, 576.0] + - - [3, 64, 36, 100352, 3, 3, 3, 64] + - [216, 572.0] + - - [3, 64, 36, 50176, 3, 3, 3, 64] + - [224, 572.0] + - - [8, 384, 64, 6600, 8, 8, 8, 384] + - [210, 1527.0] + - - [65, 1024, 1, 6400, 65, 65, 65, 1024] + - [211, 4894.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [222, 2367.0] + - - [256, 1, 1, 32768, 256, 256, 256, 1] + - [215, 106.0] + - - [256, 4, 1, 6912, 256, 256, 256, 4] + - [215, 320.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [217, 2452.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [225, 408.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [212, 419.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [212, 422.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [217, 1718.0] + - - [256, 1, 1, 6912, 256, 256, 256, 1] + - [218, 81.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [213, 5685.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [223, 367.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [212, 408.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [143, 3474.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [164, 784.0] + - - [4, 704, 1, 1280, 4, 4, 4, 704] + - [143, 259.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [203, 507.0] + - - [64, 4, 1, 256, 64, 64, 64, 4] + - [164, 15.0] + - - [64, 704, 1, 128, 64, 64, 64, 704] + - [172, 1825.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [148, 2710.0] + - - [128, 4, 1, 1280, 128, 128, 128, 4] + - [143, 47.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [143, 4140.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [166, 3517.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 64] + - [164, 2527.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [164, 4144.0] + - - [4, 704, 1, 256, 4, 4, 4, 704] + - [143, 165.0] + - - [704, 4, 1, 1280, 704, 704, 704, 4] + - [197, 260.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [164, 1748.0] + - - [64, 1024, 1, 128, 64, 64, 64, 1024] + - [164, 2512.0] + - - [4, 64, 1, 1280, 4, 4, 4, 64] + - [143, 24.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [148, 3451.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [148, 2703.0] + - - [448, 4, 1, 256, 448, 448, 448, 4] + - [164, 103.0] + - - [448, 4, 1, 1280, 448, 448, 448, 4] + - [143, 165.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [143, 20.0] + - - [256, 4, 1, 128, 256, 256, 256, 4] + - [143, 41.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [193, 3777.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [143, 504.0] + - - [704, 64, 1, 128, 704, 704, 704, 64] + - [141, 1814.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 4] + - [143, 237.0] + - - [256, 256, 1, 128, 256, 256, 256, 256] + - [164, 2512.0] + - - [64, 256, 1, 128, 64, 64, 64, 256] + - [143, 704.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [166, 3512.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [143, 2834.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [164, 1279.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [148, 3089.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [148, 3020.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [164, 1389.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [148, 869.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [148, 1731.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [164, 1997.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [148, 3022.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [148, 4360.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 4] + - [148, 426.0] + - - [4, 4, 1, 256, 4, 4, 4, 4] + - [133, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [143, 1008.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [148, 3089.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [148, 777.0] + - - [4, 448, 1, 3328, 4, 4, 4, 448] + - [148, 188.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [164, 3214.0] + - - [256, 4, 1, 1280, 256, 256, 256, 4] + - [148, 95.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [141, 2507.0] + - - [4, 704, 1, 128, 4, 4, 4, 704] + - [143, 113.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [164, 619.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [164, 2823.0] + - - [448, 64, 1, 128, 448, 448, 448, 64] + - [164, 1223.0] + - - [4, 448, 1, 1280, 4, 4, 4, 448] + - [143, 165.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [143, 3214.0] + - - [256, 64, 1, 128, 256, 256, 256, 64] + - [164, 699.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 1024] + - [148, 427.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [197, 4357.0] + - - [704, 4, 1, 128, 704, 704, 704, 4] + - [164, 111.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [143, 59.0] + - - [256, 4, 1, 3328, 256, 256, 256, 4] + - [148, 107.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [164, 60.0] + - - [4, 4, 1, 128, 4, 4, 4, 4] + - [133, 1.0] + - - [4, 128, 1, 256, 4, 4, 4, 128] + - [143, 30.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [148, 388.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [148, 3818.0] + - - [4, 448, 1, 128, 4, 4, 4, 448] + - [143, 71.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [148, 1549.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [164, 2534.0] + - - [4, 128, 1, 3328, 4, 4, 4, 128] + - [148, 54.0] + - - [64, 4, 1, 128, 64, 64, 64, 4] + - [141, 10.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [143, 257.0] + - - [4, 704, 1, 3328, 4, 4, 4, 704] + - [148, 295.0] + - - [4, 4, 1, 1280, 4, 4, 4, 4] + - [133, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [141, 733.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 4] + - [143, 163.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [148, 434.0] + - - [4, 64, 1, 128, 4, 4, 4, 64] + - [141, 10.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [148, 777.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [148, 1556.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [164, 1997.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [148, 1551.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 4] + - [197, 377.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [141, 2486.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [139, 3652.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [148, 869.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [164, 1756.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [143, 1226.0] + - - [4, 256, 1, 128, 4, 4, 4, 256] + - [164, 41.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [164, 3226.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [164, 354.0] + - - [4, 4, 1, 3328, 4, 4, 4, 4] + - [143, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1024] + - [148, 377.0] + - - [704, 4, 1, 256, 704, 704, 704, 4] + - [143, 164.0] + - - [128, 4, 1, 3328, 128, 128, 128, 4] + - [148, 54.0] + - - [448, 4, 1, 3328, 448, 448, 448, 4] + - [148, 187.0] + - - [704, 4, 1, 3328, 704, 704, 704, 4] + - [148, 294.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [139, 3637.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [197, 4357.0] + - - [4, 1024, 1, 128, 4, 4, 4, 1024] + - [143, 165.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [148, 1735.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [164, 2198.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [164, 1398.0] + - - [128, 4, 1, 256, 128, 128, 128, 4] + - [143, 30.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [148, 4145.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [148, 3447.0] + - - [448, 4, 1, 128, 448, 448, 448, 4] + - [164, 71.0] + - - [4, 256, 1, 3328, 4, 4, 4, 256] + - [148, 108.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [141, 20.0] + - - [4, 256, 1, 1280, 4, 4, 4, 256] + - [148, 95.0] + - - [64, 4, 1, 3328, 64, 64, 64, 4] + - [148, 27.0] + - - [4, 64, 1, 3328, 4, 4, 4, 64] + - [148, 27.0] + - - [4, 1024, 1, 256, 4, 4, 4, 1024] + - [143, 239.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [164, 1008.0] + - - [4, 64, 1, 256, 4, 4, 4, 64] + - [143, 15.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [135, 2224.0] + - - [64, 448, 1, 128, 64, 64, 64, 448] + - [164, 1232.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [145, 3779.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [148, 3818.0] + - - [4, 448, 1, 256, 4, 4, 4, 448] + - [164, 104.0] + - - [4, 128, 1, 1280, 4, 4, 4, 128] + - [148, 48.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [164, 352.0] + - - [64, 64, 1, 128, 64, 64, 64, 64] + - [164, 177.0] + - - [64, 4, 1, 1280, 64, 64, 64, 4] + - [143, 24.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [148, 1731.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [164, 1013.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [142, 5292.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [136, 4489.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [184, 5855.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [152, 2027.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [136, 1435.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [136, 1834.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [136, 6285.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [135, 4200.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [135, 2688.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [173, 4953.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [135, 3880.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [152, 2369.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [135, 3619.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [135, 3369.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [135, 3024.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [184, 6647.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [160, 4062.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [184, 4355.0] + - - [768, 2, 1, 16, 768, 768, 768, 2] + - [135, 14.0] + - - [768, 2, 1, 32, 768, 768, 768, 2] + - [135, 25.0] + - - [3, 64, 64, 2888, 3, 3, 3, 64] + - [187, 769.0] + - - [1, 16, 64, 640, 1, 1, 1, 16] + - [146, 77.0] + - - [512, 24, 36, 800, 512, 512, 512, 24] + - [169, 7444.0] + - - [16, 32, 36, 360, 16, 16, 16, 32] + - [164, 1211.0] + - - [1, 16, 36, 1440, 1, 1, 1, 16] + - [143, 54.0] + - - [512, 24, 64, 512, 512, 512, 512, 24] + - [190, 7345.0] + - - [3, 64, 36, 3136, 3, 3, 3, 64] + - [145, 633.0] + - - [256, 24, 64, 32, 256, 256, 256, 24] + - [136, 3010.0] + - - [256, 16, 36, 3200, 256, 256, 256, 16] + - [195, 5023.0] + - - [256, 16, 36, 32, 256, 256, 256, 16] + - [183, 1787.0] + - - [512, 24, 36, 288, 512, 512, 512, 24] + - [142, 6894.0] + - - [512, 24, 64, 128, 512, 512, 512, 24] + - [163, 6729.0] + - - [3, 64, 64, 1444, 3, 3, 3, 64] + - [139, 746.0] + - - [16, 32, 64, 160, 16, 16, 16, 32] + - [153, 1417.0] + - - [256, 16, 64, 32, 256, 256, 256, 16] + - [159, 2605.0] + - - [256, 16, 64, 1568, 256, 256, 256, 16] + - [195, 5073.0] + - - [256, 24, 36, 128, 256, 256, 256, 24] + - [160, 4552.0] + - - [16, 32, 64, 2560, 16, 16, 16, 32] + - [145, 2962.0] + - - [49, 800, 1, 128, 49, 49, 49, 800] + - [200, 1520.0] + - - [64, 12, 2520, 12, 64, 64, 64, 12] + - [135, 2657.0] + - - [64, 13, 2336, 13, 64, 64, 64, 13] + - [135, 3001.0] + - - [64, 14, 2184, 14, 64, 64, 64, 14] + - [135, 3231.0] + - - [64, 15, 2048, 15, 64, 64, 64, 15] + - [135, 3470.0] + - - [64, 16, 1920, 16, 64, 64, 64, 16] + - [135, 3827.0] + - - [64, 17, 1816, 17, 64, 64, 64, 17] + - [160, 3434.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [160, 3778.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [201, 3887.0] + - - [64, 21, 1488, 21, 64, 64, 64, 21] + - [136, 4268.0] + - - [64, 23, 1360, 23, 64, 64, 64, 23] + - [136, 4698.0] + - - [64, 25, 1256, 25, 64, 64, 64, 25] + - [160, 5233.0] + - - [64, 27, 1168, 27, 64, 64, 64, 27] + - [173, 5527.0] + - - [64, 29, 1088, 29, 64, 64, 64, 29] + - [136, 5891.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [143, 154.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [170, 212.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [135, 8.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [151, 129.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [133, 891.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 1] + - [141, 51.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [151, 320.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [133, 1.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [170, 203.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [135, 9.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [135, 10.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [148, 29.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [148, 29.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [148, 30.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [139, 9.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [193, 11.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [148, 29.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [148, 30.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [148, 31.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [155, 9.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [135, 10.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [135, 11.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [135, 9.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [141, 9.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [166, 10.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [133, 34.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [141, 6.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [148, 1604.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [164, 4033.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [164, 2558.0] + - - [384, 128, 1, 128, 384, 384, 384, 128] + - [141, 1978.0] + - - [384, 128, 1, 256, 384, 384, 384, 128] + - [141, 2700.0] + - - [64, 64, 1, 64, 64, 64, 64, 64] + - [137, 113.0] + - - [256, 4, 1, 4096, 256, 256, 256, 4] + - [148, 109.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [134, 6326.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [168, 2914.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [168, 2840.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [139, 2350.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [139, 1412.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [133, 1467.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [141, 13.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [143, 189.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [137, 36.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [137, 39.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [185, 40.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [141, 7.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [148, 208.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [141, 11.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [143, 182.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [141, 12.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [143, 185.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [134, 615.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [134, 631.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [136, 630.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [136, 633.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [157, 3800.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [157, 3961.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [157, 4007.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [181, 4101.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [199, 3399.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [134, 3831.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [158, 3928.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [182, 4198.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [182, 4924.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [182, 5766.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [134, 5946.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [158, 6017.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [199, 5953.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [159, 3879.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [183, 4026.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [135, 4105.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [159, 4160.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [142, 4240.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [184, 4367.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [184, 4464.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [184, 4851.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [184, 5375.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [201, 5951.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [173, 6866.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [160, 6894.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [184, 7021.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [164, 2512.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [133, 1.0] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [133, 17.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1] + - [148, 98.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [151, 431.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [133, 2325.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [148, 16.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [137, 52.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [148, 691.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [148, 708.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [143, 574.0] + - - [256, 1, 1, 3456, 256, 256, 256, 1] + - [148, 27.0] + - - [256, 1, 1, 4096, 256, 256, 256, 1] + - [146, 27.0] + - - [256, 1, 1, 864, 256, 256, 256, 1] + - [143, 22.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [148, 3409.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [148, 3492.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [164, 2866.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [135, 60.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [150, 60.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [141, 15.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [173, 6134.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [160, 6593.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [143, 81.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [135, 67.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [148, 152.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [202, 4342.0] + - - [2, 1024, 1, 6, 2, 2, 2, 1024] + - [135, 8.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [148, 22.0] +- null diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB.yaml index 029df7ff8..42a3f1328 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB.yaml @@ -1,7 +1,7 @@ -- {MinimumRequiredVersion: 4.33.0} +- {MinimumRequiredVersion: 4.26.0} - navi31 - gfx1100 -- [Device 6863] +- [Device 744c] - AllowNoFreeDims: false AssignedDerivedParameters: true Batched: true @@ -11,7 +11,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -54,6 +53,85264 @@ ZeroPadA: [] ZeroPadB: [] - - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_SUS128_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_SUS128_TT8_16_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_SUS128_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SN_SU32_SUM3_SUS128_TT4_8_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SN_SU32_SUM3_SUS128_TT8_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_SUS128_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x16_SN_SU0_SUM0_SUS0_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SN_SU0_SUM0_SUS0_TT8_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_SUS128_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_SUS128_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SN_SU32_SUM3_SUS128_TT4_8_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU0_SUM0_SUS0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x16_SN_SU0_SUM0_SUS0_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SN_SU0_SUM0_SUS0_TT8_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_SUS0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 32 + MacroTileA: 256 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SN_SU0_SUM0_SUS0_TT8_8_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 16 + MacroTileA: 128 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x16x8_SN_SU32_SUM3_SUS128_TT4_4_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 2 + LSPB: 64 + LVCA: 64 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 256 + MacroTile1: 64 + MacroTileA: 256 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x64x8_SN_SU32_SUM3_SUS128_TT8_16_WG32_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_SUS0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_SUS128_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_SUS128_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_SUS128_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_SUS128_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 AggressivePerfMode: 1 AssertAlphaValue: false AssertBetaValue: false @@ -91,7 +85348,7 @@ DisableVgprOverlapping: false EdgeType: ShiftPtr EnableMatrixInstruction: false - ExpandPointerSwap: 0 + ExpandPointerSwap: true Fp16AltImpl: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 @@ -116,22 +85373,280 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 32 + LVCA: 64 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false LdsBlockSizePerPad: 0 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsInitCVgprs: false - LdsNumElements: 512 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 + LdsOffsetA_Blk: 512 LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -151,9 +85666,9 @@ MACInstruction: FMA MIArchVgpr: false MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MagicDivAlg: 2 @@ -171,13 +85686,13 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 OptPreLoopVmcnt: 0 PackBatchDims: 0 @@ -195,7 +85710,7 @@ PersistentKernelAlongBatch: false PrefetchAcrossPersistent: 0 PrefetchAcrossPersistentMode: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AllowNoFreeDims: false @@ -253,12 +85768,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_ + SolutionIndex: 364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_SUS0_TT2_2_WG16_8_1_WGM1 SourceSwap: false - StaggerU: 32 + SplitGlobalRead: 1 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StoreCInUnroll: false StoreCInUnrollExact: false StoreCInUnrollInterval: 1 @@ -268,9 +85784,9 @@ StoreSyncOpt: 0 StoreVectorWidth: 4 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -291,20 +85807,10644 @@ VectorWidth: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _DepthULds: 8 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 3 - allowLRVWforTLUandMI: false + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_SUS0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] -- - - [126, 126, 2, 66, 126, 126, 126, 66] - - [0, 0] -- null +- - - [1024, 4096, 1, 1024] + - [6, 80.733] + - - [4096, 4096, 1, 1024] + - [262, 90.843] + - - [1024, 2048, 1, 1024] + - [28, 77.191] + - - [4096, 2048, 1, 1024] + - [7, 86.359] + - - [768, 4096, 1, 2] + - [29, 4.033] + - - [768, 4096, 1, 768] + - [35, 81.414] + - - [3072, 4096, 1, 768] + - [7, 89.269] + - - [768, 2048, 1, 2] + - [6, 3.397] + - - [768, 2048, 1, 768] + - [16, 73.825] + - - [3072, 2048, 1, 768] + - [1, 87.347] + - - [3072, 1024, 1, 768] + - [29, 81.856] + - - [3072, 512, 1, 768] + - [31, 73.888] + - - [1024, 3072, 1, 1024] + - [35, 81.649] + - - [3072, 2048, 1, 1024] + - [23, 87.518] + - - [3072, 3072, 1, 1024] + - [23, 91.195] + - - [3072, 512, 1, 1024] + - [16, 74.227] + - - [3072, 4096, 1, 1024] + - [264, 89.282] + - - [1024, 2048, 1, 2] + - [6, 4.061] + - - [1024, 3072, 1, 2] + - [6, 4.507] + - - [1024, 4096, 1, 2] + - [6, 4.674] + - - [128, 128, 512, 64] + - [12, 70.004] + - - [512, 512, 64, 64] + - [6, 82.925] + - - [2944, 4288, 1, 1280] + - [1, 89.819] + - - [2368, 5888, 1, 256] + - [1, 85.348] + - - [5888, 1856, 1, 256] + - [29, 84.044] + - - [512, 24000, 1, 1536] + - [23, 90.14] + - - [5888, 1408, 1, 256] + - [1, 83.133] + - - [5888, 1856, 1, 3328] + - [1, 86.075] + - - [1856, 4288, 1, 256] + - [22, 81.477] + - - [1024, 5056, 1, 128] + - [6, 79.731] + - - [5056, 5056, 1, 3328] + - [30, 90.465] + - - [1408, 5888, 1, 1280] + - [1, 86.16] + - - [6144, 6000, 1, 2560] + - [1, 93.163] + - - [2368, 6784, 1, 128] + - [30, 82.831] + - - [1024, 3584, 1, 3328] + - [29, 82.019] + - - [512, 48000, 1, 2048] + - [39, 91.52] + - - [5888, 1408, 1, 1280] + - [1, 86.436] + - - [1408, 4288, 1, 256] + - [29, 82.483] + - - [1024, 2368, 1, 256] + - [29, 73.704] + - - [1408, 1856, 1, 1280] + - [29, 82.199] + - - [5056, 5056, 1, 1280] + - [1, 90.135] + - - [448, 5056, 1, 256] + - [10, 62.677] + - - [1856, 1408, 1, 128] + - [14, 69.481] + - - [6784, 256, 1, 3328] + - [31, 83.201] + - - [1408, 3584, 1, 256] + - [30, 82.348] + - - [4288, 448, 1, 256] + - [6, 71.994] + - - [1024, 1856, 1, 128] + - [0, 68.596] + - - [4288, 2944, 1, 1280] + - [36, 89.91] + - - [704, 5056, 1, 1280] + - [23, 76.379] + - - [2368, 704, 1, 3328] + - [0, 78.545] + - - [256, 5888, 1, 256] + - [35, 66.349] + - - [1856, 4288, 1, 3328] + - [29, 83.634] + - - [5888, 1024, 1, 256] + - [14, 82.294] + - - [1408, 2944, 1, 256] + - [29, 78.666] + - - [6784, 5056, 1, 3328] + - [1, 92.504] + - - [5056, 5056, 1, 256] + - [15, 87.424] + - - [704, 5056, 1, 128] + - [29, 68.429] + - - [2368, 2944, 1, 1280] + - [1, 83.386] + - - [6784, 6784, 1, 1280] + - [15, 92.901] + - - [1408, 4288, 1, 1280] + - [36, 84.099] + - - [3584, 4288, 1, 1280] + - [1, 89.806] + - - [512, 6000, 1, 2560] + - [35, 80.426] + - - [2368, 704, 1, 1280] + - [14, 77.227] + - - [5056, 4288, 1, 3328] + - [1, 90.46] + - - [3584, 2368, 1, 3328] + - [36, 88.407] + - - [5888, 6784, 1, 1280] + - [30, 93.181] + - - [6784, 448, 1, 1280] + - [14, 79.564] + - - [2944, 5888, 1, 256] + - [15, 89.314] + - - [4288, 2944, 1, 256] + - [1, 86.499] + - - [5888, 704, 1, 1280] + - [14, 80.634] + - - [448, 5888, 1, 128] + - [29, 63.313] + - - [5056, 2368, 1, 1280] + - [1, 85.475] + - - [448, 3584, 1, 1280] + - [22, 64.788] + - - [6784, 5888, 1, 256] + - [1, 91.367] + - - [1024, 1408, 1, 256] + - [29, 70.622] + - - [2368, 2368, 1, 3328] + - [0, 82.222] + - - [1856, 6784, 1, 128] + - [14, 80.828] + - - [5056, 704, 1, 3328] + - [0, 79.677] + - - [1408, 1856, 1, 256] + - [29, 79.095] + - - [2368, 5056, 1, 256] + - [14, 82.975] + - - [3584, 2368, 1, 1280] + - [23, 87.839] + - - [704, 5888, 1, 256] + - [23, 73.609] + - - [6784, 2944, 1, 128] + - [7, 88.033] + - - [2944, 6784, 1, 3328] + - [1, 93.0] + - - [3584, 704, 1, 3328] + - [29, 80.196] + - - [448, 4288, 1, 256] + - [35, 63.611] + - - [704, 2368, 1, 1280] + - [22, 67.464] + - - [1856, 2368, 1, 1280] + - [36, 83.192] + - - [1856, 4288, 1, 1280] + - [29, 83.332] + - - [256, 193600, 1, 64] + - [0, 68.082] + - - [704, 2944, 1, 128] + - [6, 62.668] + - - [1408, 1024, 1, 1280] + - [29, 79.104] + - - [704, 6784, 1, 256] + - [7, 75.192] + - - [6784, 704, 1, 256] + - [14, 80.106] + - - [5056, 1408, 1, 128] + - [6, 80.137] + - - [2048, 7000, 1, 2048] + - [23, 90.212] + - - [5056, 704, 1, 256] + - [6, 75.937] + - - [3584, 4288, 1, 3328] + - [30, 90.049] + - - [5888, 1856, 1, 1280] + - [30, 85.587] + - - [2368, 3584, 1, 1280] + - [1, 87.924] + - - [2944, 3584, 1, 3328] + - [1, 92.026] + - - [6784, 2944, 1, 256] + - [7, 90.262] + - - [1024, 1500, 1, 2560] + - [16, 73.198] + - - [1856, 2368, 1, 256] + - [15, 78.098] + - - [3584, 6784, 1, 3328] + - [30, 92.017] + - - [1024, 5888, 1, 3328] + - [7, 85.064] + - - [6144, 24000, 1, 2560] + - [1, 94.015] + - - [5056, 4288, 1, 1280] + - [1, 90.036] + - - [2368, 2368, 1, 1280] + - [14, 81.626] + - - [2944, 5888, 1, 128] + - [36, 86.824] + - - [704, 5888, 1, 1280] + - [36, 78.901] + - - [2368, 3584, 1, 128] + - [14, 80.349] + - - [1856, 5056, 1, 128] + - [0, 79.673] + - - [2944, 6784, 1, 1280] + - [36, 92.635] + - - [1024, 5056, 1, 1280] + - [30, 86.941] + - - [4288, 1024, 1, 256] + - [2, 78.265] + - - [2944, 2368, 1, 128] + - [29, 82.195] + - - [5888, 448, 1, 1280] + - [0, 76.329] + - - [704, 5888, 1, 3328] + - [5, 79.763] + - - [3584, 2944, 1, 256] + - [36, 88.335] + - - [1856, 2368, 1, 3328] + - [36, 84.342] + - - [512, 6000, 1, 2816] + - [22, 80.512] + - - [512, 24000, 1, 2048] + - [27, 90.031] + - - [1408, 5056, 1, 3328] + - [23, 85.407] + - - [1856, 1856, 1, 3328] + - [29, 77.241] + - - [2368, 2368, 1, 256] + - [14, 80.056] + - - [4288, 4288, 1, 1280] + - [23, 88.123] + - - [5888, 1024, 1, 1280] + - [1, 84.766] + - - [1024, 12544, 1, 256] + - [7, 87.943] + - - [5888, 448, 1, 128] + - [29, 70.92] + - - [512, 48000, 1, 2560] + - [36, 91.881] + - - [704, 6784, 1, 3328] + - [23, 81.707] + - - [5888, 5888, 1, 1280] + - [1, 93.23] + - - [5056, 1024, 1, 1280] + - [1, 87.347] + - - [448, 5888, 1, 3328] + - [22, 69.512] + - - [1024, 2944, 1, 1280] + - [29, 79.05] + - - [5056, 5888, 1, 1280] + - [1, 90.952] + - - [4288, 5888, 1, 128] + - [1, 85.655] + - - [1408, 3584, 1, 128] + - [35, 78.296] + - - [448, 3584, 1, 128] + - [14, 56.13] + - - [5888, 2944, 1, 1280] + - [1, 91.001] + - - [2368, 5888, 1, 128] + - [7, 82.447] + - - [3584, 5888, 1, 256] + - [7, 90.104] + - - [2368, 1024, 1, 128] + - [14, 67.13] + - - [2368, 704, 1, 128] + - [14, 64.612] + - - [3584, 2368, 1, 128] + - [29, 83.295] + - - [5056, 704, 1, 128] + - [0, 74.859] + - - [5056, 1408, 1, 3328] + - [1, 85.614] + - - [6784, 1024, 1, 3328] + - [1, 90.068] + - - [6784, 2944, 1, 3328] + - [1, 93.063] + - - [1856, 1856, 1, 256] + - [29, 73.112] + - - [6784, 2368, 1, 1280] + - [1, 90.289] + - - [4288, 3584, 1, 256] + - [15, 87.36] + - - [4288, 5888, 1, 1280] + - [1, 90.835] + - - [1024, 6000, 1, 1536] + - [23, 85.696] + - - [4288, 1856, 1, 1280] + - [0, 83.557] + - - [1856, 2944, 1, 3328] + - [36, 84.676] + - - [256, 6784, 1, 3328] + - [8, 82.695] + - - [512, 3000, 1, 1536] + - [24, 72.63] + - - [256, 5056, 1, 128] + - [29, 61.368] + - - [5056, 1024, 1, 256] + - [7, 83.656] + - - [5056, 1856, 1, 3328] + - [36, 86.589] + - - [4288, 1408, 1, 128] + - [14, 78.211] + - - [1856, 5888, 1, 3328] + - [36, 85.962] + - - [4288, 5056, 1, 256] + - [15, 87.536] + - - [4096, 7000, 1, 4096] + - [1, 92.531] + - - [5056, 256, 1, 3328] + - [29, 71.741] + - - [1024, 3000, 1, 2560] + - [22, 80.543] + - - [1024, 5888, 1, 1280] + - [15, 84.329] + - - [6784, 2368, 1, 128] + - [29, 84.956] + - - [1856, 1024, 1, 1280] + - [0, 76.731] + - - [6784, 4288, 1, 1280] + - [1, 90.708] + - - [1856, 1856, 1, 1280] + - [22, 76.532] + - - [3072, 24000, 1, 1024] + - [4, 91.859] + - - [1408, 5056, 1, 1280] + - [23, 84.748] + - - [5888, 1856, 1, 128] + - [35, 83.129] + - - [448, 6784, 1, 128] + - [6, 65.835] + - - [5056, 3584, 1, 128] + - [1, 84.487] + - - [5888, 5888, 1, 3328] + - [1, 93.542] + - - [6784, 1024, 1, 256] + - [30, 85.484] + - - [2944, 2368, 1, 256] + - [29, 83.769] + - - [5056, 5888, 1, 3328] + - [30, 91.304] + - - [1856, 1024, 1, 256] + - [29, 69.688] + - - [512, 48000, 1, 1536] + - [23, 91.597] + - - [3584, 448, 1, 1280] + - [29, 74.813] + - - [448, 5888, 1, 256] + - [22, 65.916] + - - [1408, 6784, 1, 3328] + - [23, 88.082] + - - [4288, 704, 1, 128] + - [29, 70.067] + - - [5056, 2944, 1, 256] + - [15, 87.681] + - - [6784, 5888, 1, 128] + - [1, 88.804] + - - [2944, 704, 1, 128] + - [29, 66.494] + - - [1408, 3584, 1, 3328] + - [23, 86.679] + - - [2368, 6784, 1, 256] + - [30, 86.269] + - - [5056, 1408, 1, 1280] + - [1, 84.978] + - - [5056, 4288, 1, 128] + - [30, 85.078] + - - [4288, 2368, 1, 3328] + - [1, 88.574] + - - [1408, 1856, 1, 128] + - [6, 72.977] + - - [1408, 5888, 1, 3328] + - [23, 86.815] + - - [6784, 6784, 1, 256] + - [1, 91.01] + - - [5888, 5056, 1, 128] + - [35, 85.89] + - - [4288, 2368, 1, 128] + - [35, 81.085] + - - [2368, 2944, 1, 256] + - [29, 79.993] + - - [3584, 1856, 1, 1280] + - [1, 85.47] + - - [6784, 6784, 1, 128] + - [1, 88.782] + - - [5888, 5056, 1, 256] + - [7, 88.534] + - - [8448, 48000, 1, 2816] + - [1, 94.381] + - - [512, 6000, 1, 2048] + - [35, 80.196] + - - [3584, 448, 1, 256] + - [35, 70.193] + - - [448, 4288, 1, 128] + - [29, 59.708] + - - [256, 6784, 1, 256] + - [35, 75.409] + - - [1408, 4288, 1, 128] + - [29, 80.823] + - - [2944, 704, 1, 3328] + - [21, 78.233] + - - [3584, 3584, 1, 256] + - [36, 88.552] + - - [3584, 5056, 1, 256] + - [30, 87.212] + - - [2944, 2368, 1, 1280] + - [0, 85.096] + - - [704, 6784, 1, 128] + - [29, 72.621] + - - [6784, 3584, 1, 256] + - [15, 90.18] + - - [1856, 1408, 1, 256] + - [2, 71.321] + - - [5056, 2368, 1, 128] + - [29, 82.195] + - - [2944, 2944, 1, 3328] + - [36, 90.379] + - - [5056, 6784, 1, 256] + - [1, 89.486] + - - [1856, 3584, 1, 128] + - [6, 77.417] + - - [3584, 6784, 1, 128] + - [7, 87.618] + - - [2368, 6784, 1, 1280] + - [36, 90.248] + - - [5056, 1856, 1, 256] + - [15, 83.052] + - - [1024, 3000, 1, 2816] + - [29, 80.715] + - - [1024, 1856, 1, 256] + - [29, 71.876] + - - [1408, 6784, 1, 1280] + - [7, 87.938] + - - [3584, 3584, 1, 1280] + - [1, 91.457] + - - [7680, 24000, 1, 2560] + - [1, 93.939] + - - [4608, 48000, 1, 1536] + - [23, 93.961] + - - [5888, 5888, 1, 128] + - [36, 88.019] + - - [5056, 2368, 1, 3328] + - [23, 85.804] + - - [2944, 4288, 1, 256] + - [30, 86.093] + - - [1408, 3584, 1, 1280] + - [23, 85.984] + - - [1024, 1500, 1, 2816] + - [7, 73.194] + - - [1024, 6000, 1, 2048] + - [7, 86.323] + - - [512, 24000, 1, 2560] + - [7, 90.374] + - - [6144, 3000, 1, 2560] + - [1, 90.704] + - - [2368, 6784, 1, 3328] + - [1, 90.492] + - - [1856, 1408, 1, 1280] + - [30, 76.122] + - - [6784, 704, 1, 128] + - [14, 78.581] + - - [5056, 2944, 1, 128] + - [1, 84.342] + - - [1408, 5888, 1, 256] + - [15, 83.819] + - - [704, 2944, 1, 1280] + - [17, 75.382] + - - [3584, 704, 1, 1280] + - [29, 79.564] + - - [5888, 2368, 1, 256] + - [36, 85.24] + - - [2944, 6784, 1, 128] + - [36, 88.1] + - - [3584, 448, 1, 3328] + - [29, 75.734] + - - [704, 2368, 1, 3328] + - [6, 68.064] + - - [4608, 6000, 1, 1536] + - [1, 91.448] + - - [256, 5888, 1, 128] + - [29, 58.58] + - - [2944, 2944, 1, 1280] + - [1, 89.937] + - - [5056, 448, 1, 3328] + - [14, 81.067] + - - [6784, 704, 1, 3328] + - [29, 82.195] + - - [5888, 4288, 1, 128] + - [0, 85.808] + - - [1408, 2944, 1, 3328] + - [29, 80.927] + - - [3584, 704, 1, 128] + - [29, 72.567] + - - [448, 5056, 1, 128] + - [14, 59.785] + - - [5056, 3584, 1, 256] + - [1, 87.234] + - - [4288, 4288, 1, 256] + - [7, 85.957] + - - [1408, 5056, 1, 128] + - [29, 80.99] + - - [2944, 3584, 1, 128] + - [1, 85.556] + - - [3584, 2368, 1, 256] + - [29, 84.622] + - - [5888, 5056, 1, 1280] + - [1, 90.965] + - - [8448, 24000, 1, 2816] + - [1, 94.101] + - - [3584, 3584, 1, 3328] + - [1, 91.899] + - - [3072, 1500, 1, 128] + - [35, 77.493] + - - [2048, 3136, 1, 512] + - [29, 84.672] + - - [3025, 256, 64, 64] + - [0, 54.7] + - - [5888, 6784, 1, 256] + - [1, 90.23] + - - [4288, 2944, 1, 3328] + - [1, 90.424] + - - [256, 5056, 1, 1280] + - [35, 71.163] + - - [2944, 5888, 1, 3328] + - [15, 91.371] + - - [6784, 5888, 1, 1280] + - [15, 93.433] + - - [5888, 4288, 1, 1280] + - [1, 90.65] + - - [1024, 24000, 1, 2048] + - [23, 91.173] + - - [5888, 3584, 1, 128] + - [30, 86.639] + - - [6784, 6784, 1, 3328] + - [1, 93.176] + - - [704, 3584, 1, 128] + - [14, 63.462] + - - [5888, 448, 1, 3328] + - [29, 76.415] + - - [2368, 4288, 1, 1280] + - [36, 87.794] + - - [4288, 2944, 1, 128] + - [1, 83.277] + - - [5056, 2944, 1, 3328] + - [1, 90.478] + - - [2944, 3584, 1, 256] + - [15, 87.875] + - - [1408, 1408, 1, 3328] + - [29, 80.877] + - - [3584, 3584, 1, 128] + - [1, 85.479] + - - [3584, 704, 1, 256] + - [14, 76.65] + - - [3584, 1408, 1, 3328] + - [1, 86.652] + - - [704, 3584, 1, 1280] + - [23, 73.69] + - - [1024, 1408, 1, 128] + - [0, 67.473] + - - [1856, 6784, 1, 256] + - [1, 83.53] + - - [4288, 448, 1, 3328] + - [0, 78.423] + - - [6784, 4288, 1, 128] + - [30, 86.178] + - - [6784, 704, 1, 1280] + - [29, 81.847] + - - [3584, 6784, 1, 256] + - [36, 89.526] + - - [5888, 1024, 1, 3328] + - [1, 85.217] + - - [704, 6784, 1, 1280] + - [36, 81.13] + - - [1856, 5056, 1, 3328] + - [23, 86.557] + - - [1024, 3584, 1, 128] + - [35, 75.053] + - - [2368, 2944, 1, 128] + - [6, 78.233] + - - [5888, 2944, 1, 3328] + - [1, 91.448] + - - [1408, 2368, 1, 128] + - [29, 74.457] + - - [5888, 2368, 1, 128] + - [29, 84.225] + - - [3584, 6784, 1, 1280] + - [1, 91.782] + - - [4288, 1856, 1, 256] + - [6, 81.752] + - - [1856, 5888, 1, 256] + - [1, 83.214] + - - [4288, 4288, 1, 3328] + - [1, 88.497] + - - [4288, 1408, 1, 1280] + - [7, 84.184] + - - [3584, 5056, 1, 128] + - [0, 84.915] + - - [4288, 2368, 1, 256] + - [30, 84.667] + - - [2944, 5056, 1, 1280] + - [1, 90.027] + - - [448, 6784, 1, 256] + - [16, 69.318] + - - [1856, 2368, 1, 128] + - [0, 75.72] + - - [6784, 2368, 1, 3328] + - [1, 90.677] + - - [1408, 6784, 1, 128] + - [29, 82.28] + - - [4288, 1856, 1, 3328] + - [14, 83.679] + - - [3584, 448, 1, 128] + - [29, 66.232] + - - [3584, 1024, 1, 1280] + - [0, 81.635] + - - [1856, 5056, 1, 256] + - [15, 82.574] + - - [6784, 4288, 1, 3328] + - [1, 91.078] + - - [1024, 4288, 1, 256] + - [29, 81.955] + - - [5888, 3584, 1, 3328] + - [1, 92.693] + - - [5056, 3584, 1, 3328] + - [1, 89.612] + - - [2368, 1408, 1, 1280] + - [30, 81.955] + - - [5056, 2944, 1, 1280] + - [1, 89.874] + - - [8448, 6000, 1, 2816] + - [1, 93.131] + - - [3584, 2944, 1, 1280] + - [1, 91.331] + - - [1024, 6784, 1, 256] + - [11, 84.617] + - - [6784, 448, 1, 256] + - [14, 77.114] + - - [5124, 9124, 1, 2048] + - [23, 90.298] + - - [2944, 5056, 1, 3328] + - [30, 90.266] + - - [2944, 1408, 1, 128] + - [29, 75.612] + - - [5056, 6784, 1, 3328] + - [1, 92.508] + - - [704, 2368, 1, 128] + - [14, 57.236] + - - [3072, 1500, 1, 1024] + - [23, 86.084] + - - [3584, 4288, 1, 256] + - [30, 86.594] + - - [1856, 6784, 1, 3328] + - [1, 86.539] + - - [5888, 4288, 1, 256] + - [30, 88.182] + - - [5056, 1408, 1, 256] + - [7, 81.536] + - - [3584, 1024, 1, 256] + - [29, 79.379] + - - [512, 6000, 1, 1536] + - [35, 79.79] + - - [5888, 5888, 1, 256] + - [1, 90.699] + - - [4288, 1024, 1, 1280] + - [1, 82.871] + - - [448, 6784, 1, 3328] + - [21, 76.911] + - - [2944, 1408, 1, 1280] + - [14, 80.507] + - - [3072, 6000, 1, 1024] + - [7, 90.437] + - - [2944, 1856, 1, 3328] + - [36, 84.595] + - - [3584, 5888, 1, 1280] + - [36, 92.4] + - - [6784, 1856, 1, 1280] + - [35, 86.3] + - - [2944, 5056, 1, 256] + - [15, 86.986] + - - [5888, 256, 1, 3328] + - [1, 72.643] + - - [2944, 4288, 1, 128] + - [29, 84.653] + - - [3584, 1408, 1, 256] + - [1, 82.158] + - - [704, 3584, 1, 3328] + - [23, 75.508] + - - [5056, 448, 1, 1280] + - [29, 79.997] + - - [3584, 1856, 1, 3328] + - [36, 86.323] + - - [4288, 6784, 1, 1280] + - [1, 90.645] + - - [1024, 3000, 1, 2048] + - [22, 80.209] + - - [2944, 1024, 1, 256] + - [14, 76.559] + - - [2368, 4288, 1, 3328] + - [1, 88.506] + - - [1024, 1408, 1, 1280] + - [35, 78.066] + - - [6784, 5056, 1, 256] + - [1, 89.738] + - - [1856, 1856, 1, 128] + - [6, 70.645] + - - [4288, 5888, 1, 256] + - [7, 88.962] + - - [2944, 6784, 1, 256] + - [1, 90.541] + - - [2944, 2944, 1, 128] + - [7, 83.53] + - - [1856, 3584, 1, 1280] + - [36, 85.448] + - - [3584, 1408, 1, 1280] + - [1, 85.93] + - - [4288, 448, 1, 128] + - [6, 66.349] + - - [5056, 256, 1, 1280] + - [29, 70.771] + - - [1856, 1408, 1, 3328] + - [15, 78.202] + - - [1024, 4288, 1, 3328] + - [29, 85.285] + - - [5056, 448, 1, 256] + - [0, 75.219] + - - [2944, 2368, 1, 3328] + - [29, 85.452] + - - [704, 4288, 1, 3328] + - [21, 76.767] + - - [1024, 1856, 1, 1280] + - [29, 76.934] + - - [6784, 1856, 1, 256] + - [29, 85.402] + - - [512, 48000, 1, 2816] + - [23, 92.179] + - - [512, 3000, 1, 2816] + - [37, 73.216] + - - [1024, 5888, 1, 256] + - [29, 82.064] + - - [6784, 1408, 1, 256] + - [7, 85.614] + - - [1408, 2368, 1, 256] + - [29, 78.017] + - - [1408, 1408, 1, 256] + - [29, 75.82] + - - [2368, 2368, 1, 128] + - [29, 76.884] + - - [6784, 1408, 1, 128] + - [14, 83.002] + - - [1408, 5056, 1, 256] + - [29, 82.371] + - - [512, 50176, 1, 128] + - [1, 87.591] + - - [4288, 3584, 1, 128] + - [14, 83.647] + - - [3584, 5056, 1, 1280] + - [7, 89.386] + - - [1856, 1024, 1, 128] + - [29, 64.486] + - - [1024, 24000, 1, 1536] + - [7, 91.371] + - - [704, 4288, 1, 256] + - [29, 68.745] + - - [5888, 2368, 1, 1280] + - [1, 87.952] + - - [6784, 1856, 1, 3328] + - [29, 86.607] + - - [2368, 5888, 1, 1280] + - [15, 87.708] + - - [5888, 256, 1, 1280] + - [1, 71.845] + - - [2368, 1856, 1, 3328] + - [1, 84.347] + - - [2944, 704, 1, 256] + - [14, 71.245] + - - [2368, 1024, 1, 3328] + - [0, 77.047] + - - [704, 3584, 1, 256] + - [36, 68.926] + - - [704, 2944, 1, 3328] + - [5, 77.547] + - - [6784, 1024, 1, 128] + - [23, 82.276] + - - [2944, 1024, 1, 3328] + - [14, 79.492] + - - [2944, 5056, 1, 128] + - [0, 84.658] + - - [1408, 6784, 1, 256] + - [15, 85.614] + - - [6784, 1408, 1, 3328] + - [1, 88.227] + - - [4288, 6784, 1, 128] + - [7, 85.93] + - - [1408, 2944, 1, 128] + - [35, 75.752] + - - [6784, 2944, 1, 1280] + - [1, 92.707] + - - [4288, 1856, 1, 128] + - [14, 79.298] + - - [1856, 2944, 1, 128] + - [0, 76.108] + - - [6784, 448, 1, 128] + - [29, 74.561] + - - [448, 5056, 1, 1280] + - [35, 70.875] + - - [4288, 5056, 1, 1280] + - [1, 90.108] + - - [2368, 1856, 1, 128] + - [14, 74.561] + - - [4288, 704, 1, 256] + - [6, 75.404] + - - [5888, 704, 1, 256] + - [29, 78.639] + - - [3584, 1024, 1, 128] + - [0, 76.79] + - - [256, 5888, 1, 3328] + - [23, 72.386] + - - [1408, 4288, 1, 3328] + - [23, 85.353] + - - [6784, 4288, 1, 256] + - [7, 88.466] + - - [5888, 256, 1, 256] + - [35, 65.794] + - - [6784, 1024, 1, 1280] + - [36, 89.558] + - - [5888, 1024, 1, 128] + - [29, 80.823] + - - [6784, 3584, 1, 1280] + - [1, 91.768] + - - [1024, 6784, 1, 1280] + - [23, 88.768] + - - [1408, 2944, 1, 1280] + - [0, 80.584] + - - [1408, 2368, 1, 3328] + - [36, 84.193] + - - [2944, 1856, 1, 128] + - [29, 80.417] + - - [256, 6784, 1, 128] + - [0, 69.841] + - - [5056, 6784, 1, 128] + - [15, 86.445] + - - [4288, 5056, 1, 128] + - [23, 84.017] + - - [1856, 5888, 1, 128] + - [0, 80.128] + - - [3584, 1856, 1, 256] + - [29, 83.201] + - - [4288, 3584, 1, 1280] + - [15, 89.666] + - - [704, 5888, 1, 128] + - [14, 70.365] + - - [6784, 3584, 1, 128] + - [36, 87.834] + - - [5124, 1500, 1, 2048] + - [23, 85.249] + - - [4288, 5056, 1, 3328] + - [1, 90.469] + - - [1408, 1408, 1, 128] + - [29, 68.885] + - - [5056, 2368, 1, 256] + - [14, 83.313] + - - [4288, 704, 1, 3328] + - [29, 79.447] + - - [448, 3584, 1, 256] + - [22, 60.525] + - - [2368, 1024, 1, 1280] + - [14, 76.248] + - - [2944, 1408, 1, 3328] + - [29, 80.891] + - - [6144, 1500, 1, 2560] + - [23, 89.937] + - - [1024, 1408, 1, 3328] + - [29, 79.939] + - - [2944, 5888, 1, 1280] + - [30, 91.051] + - - [5888, 3584, 1, 256] + - [36, 89.905] + - - [2368, 5056, 1, 128] + - [14, 81.707] + - - [1408, 1856, 1, 3328] + - [29, 82.831] + - - [5888, 5056, 1, 3328] + - [1, 91.389] + - - [7680, 6000, 1, 2560] + - [23, 92.982] + - - [6784, 1408, 1, 1280] + - [1, 87.852] + - - [512, 3000, 1, 2560] + - [36, 73.216] + - - [704, 2944, 1, 256] + - [8, 69.228] + - - [6784, 5888, 1, 3328] + - [1, 93.758] + - - [2368, 4288, 1, 128] + - [29, 80.394] + - - [1024, 6784, 1, 128] + - [7, 81.807] + - - [1024, 1500, 1, 1536] + - [7, 72.106] + - - [1408, 1408, 1, 1280] + - [29, 79.781] + - - [3072, 3000, 1, 1024] + - [23, 88.728] + - - [448, 4288, 1, 3328] + - [34, 72.215] + - - [2368, 1408, 1, 256] + - [14, 77.141] + - - [704, 2368, 1, 256] + - [22, 63.34] + - - [1024, 24000, 1, 2560] + - [23, 91.935] + - - [5888, 2368, 1, 3328] + - [30, 88.168] + - - [5124, 9124, 1, 1760] + - [15, 90.884] + - - [4288, 448, 1, 1280] + - [14, 76.965] + - - [5888, 704, 1, 3328] + - [29, 80.949] + - - [5056, 256, 1, 128] + - [6, 56.541] + - - [1024, 6784, 1, 3328] + - [36, 89.458] + - - [1408, 5888, 1, 128] + - [35, 81.202] + - - [512, 3136, 1, 2048] + - [24, 76.045] + - - [1408, 1024, 1, 256] + - [14, 73.685] + - - [8448, 1500, 1, 2816] + - [1, 90.465] + - - [2560, 7000, 1, 2560] + - [23, 91.168] + - - [5056, 6784, 1, 1280] + - [1, 92.017] + - - [704, 5056, 1, 3328] + - [23, 77.566] + - - [3584, 5056, 1, 3328] + - [36, 89.54] + - - [2368, 2944, 1, 3328] + - [1, 83.53] + - - [2368, 3584, 1, 256] + - [1, 84.342] + - - [4608, 3000, 1, 1536] + - [36, 90.437] + - - [5056, 3584, 1, 1280] + - [1, 89.409] + - - [5124, 9124, 1, 4096] + - [7, 90.474] + - - [7680, 48000, 1, 2560] + - [1, 94.281] + - - [1856, 2944, 1, 1280] + - [36, 83.336] + - - [4608, 1500, 1, 1536] + - [36, 88.66] + - - [1024, 48000, 1, 2816] + - [15, 93.09] + - - [5124, 9124, 1, 2560] + - [1, 90.604] + - - [2944, 1408, 1, 256] + - [14, 78.179] + - - [4288, 1408, 1, 3328] + - [1, 85.466] + - - [5888, 2944, 1, 128] + - [36, 85.84] + - - [2944, 1024, 1, 128] + - [14, 73.834] + - - [5124, 700, 1, 2048] + - [29, 79.578] + - - [6784, 5056, 1, 128] + - [1, 86.621] + - - [256, 12544, 1, 1024] + - [22, 82.894] + - - [5888, 1408, 1, 3328] + - [1, 86.882] + - - [2368, 1856, 1, 256] + - [14, 77.547] + - - [256, 5056, 1, 256] + - [35, 66.268] + - - [5056, 5056, 1, 128] + - [23, 85.087] + - - [448, 3584, 1, 3328] + - [22, 65.411] + - - [5888, 256, 1, 128] + - [6, 62.447] + - - [3584, 1856, 1, 128] + - [29, 81.491] + - - [4288, 4288, 1, 128] + - [0, 83.571] + - - [1856, 1024, 1, 3328] + - [14, 77.363] + - - [1856, 4288, 1, 128] + - [6, 78.486] + - - [1024, 6000, 1, 2560] + - [7, 86.476] + - - [1024, 5056, 1, 256] + - [29, 81.915] + - - [5056, 5888, 1, 128] + - [15, 86.49] + - - [2368, 1408, 1, 3328] + - [36, 83.914] + - - [1024, 48000, 1, 1536] + - [23, 92.626] + - - [5888, 448, 1, 256] + - [14, 72.508] + - - [5888, 6784, 1, 128] + - [1, 87.365] + - - [6784, 5056, 1, 1280] + - [15, 91.976] + - - [5056, 704, 1, 1280] + - [6, 78.838] + - - [1024, 48000, 1, 2560] + - [23, 93.0] + - - [1024, 2368, 1, 128] + - [29, 68.511] + - - [3072, 48000, 1, 1024] + - [27, 92.59] + - - [1024, 5888, 1, 128] + - [29, 79.605] + - - [3584, 5888, 1, 128] + - [7, 87.739] + - - [5056, 5888, 1, 256] + - [36, 88.867] + - - [2368, 1024, 1, 256] + - [0, 71.19] + - - [2944, 1856, 1, 256] + - [29, 82.366] + - - [1856, 6784, 1, 1280] + - [15, 86.165] + - - [8448, 3000, 1, 2816] + - [1, 91.398] + - - [6784, 448, 1, 3328] + - [0, 79.93] + - - [5056, 1856, 1, 1280] + - [23, 85.799] + - - [1408, 1024, 1, 3328] + - [14, 80.187] + - - [7680, 1500, 1, 2560] + - [36, 90.203] + - - [5888, 3584, 1, 1280] + - [23, 92.387] + - - [1856, 3584, 1, 3328] + - [36, 86.255] + - - [1024, 2944, 1, 256] + - [29, 75.991] + - - [448, 6784, 1, 1280] + - [21, 75.219] + - - [704, 5056, 1, 256] + - [6, 71.15] + - - [3584, 1024, 1, 3328] + - [14, 82.023] + - - [2944, 1856, 1, 1280] + - [14, 83.95] + - - [5056, 256, 1, 256] + - [0, 64.215] + - - [2944, 4288, 1, 3328] + - [30, 90.225] + - - [2368, 3584, 1, 3328] + - [36, 88.601] + - - [2944, 704, 1, 1280] + - [34, 76.14] + - - [2944, 3584, 1, 1280] + - [1, 91.57] + - - [1856, 5888, 1, 1280] + - [1, 85.682] + - - [4608, 24000, 1, 1536] + - [1, 93.623] + - - [4288, 1408, 1, 256] + - [6, 81.635] + - - [5888, 1408, 1, 128] + - [14, 81.59] + - - [4288, 2368, 1, 1280] + - [23, 88.042] + - - [6784, 2368, 1, 256] + - [1, 87.347] + - - [1024, 24000, 1, 2816] + - [30, 92.265] + - - [1856, 2944, 1, 256] + - [1, 78.184] + - - [5056, 1024, 1, 128] + - [6, 79.817] + - - [7680, 3000, 1, 2560] + - [1, 91.119] + - - [4224, 1500, 1, 176] + - [29, 80.99] + - - [5124, 700, 1, 2560] + - [14, 79.876] + - - [6784, 256, 1, 128] + - [29, 70.73] + - - [5888, 704, 1, 128] + - [14, 76.92] + - - [1024, 4288, 1, 1280] + - [29, 84.699] + - - [2368, 5056, 1, 3328] + - [36, 85.718] + - - [4288, 1024, 1, 3328] + - [1, 84.288] + - - [6144, 48000, 1, 2560] + - [1, 94.214] + - - [1024, 5056, 1, 3328] + - [23, 88.371] + - - [1024, 1856, 1, 3328] + - [29, 77.633] + - - [5124, 1500, 1, 2560] + - [36, 85.344] + - - [4288, 6784, 1, 256] + - [7, 88.43] + - - [3584, 2944, 1, 3328] + - [23, 91.895] + - - [5888, 2944, 1, 256] + - [30, 88.308] + - - [448, 4288, 1, 1280] + - [34, 70.117] + - - [1024, 4288, 1, 128] + - [0, 79.839] + - - [5056, 4288, 1, 256] + - [23, 87.825] + - - [1024, 3584, 1, 256] + - [29, 78.82] + - - [448, 5888, 1, 1280] + - [22, 68.844] + - - [512, 3000, 1, 2048] + - [8, 72.806] + - - [5056, 448, 1, 128] + - [6, 69.878] + - - [4288, 704, 1, 1280] + - [22, 78.495] + - - [3584, 2944, 1, 128] + - [1, 84.726] + - - [6784, 256, 1, 1280] + - [31, 82.249] + - - [2368, 5888, 1, 3328] + - [1, 88.046] + - - [2368, 1856, 1, 1280] + - [23, 82.826] + - - [448, 5056, 1, 3328] + - [35, 71.592] + - - [3584, 4288, 1, 128] + - [29, 84.365] + - - [1024, 6000, 1, 2816] + - [23, 86.508] + - - [5888, 4288, 1, 3328] + - [15, 91.015] + - - [2368, 704, 1, 256] + - [14, 69.611] + - - [3584, 1408, 1, 128] + - [0, 78.549] + - - [1856, 5056, 1, 1280] + - [36, 86.016] + - - [2944, 1024, 1, 1280] + - [29, 78.897] + - - [3584, 5888, 1, 3328] + - [1, 92.815] + - - [2368, 4288, 1, 256] + - [15, 83.462] + - - [1024, 2368, 1, 3328] + - [29, 77.087] + - - [1024, 2944, 1, 128] + - [14, 73.519] + - - [1024, 3584, 1, 1280] + - [29, 81.567] + - - [4288, 5888, 1, 3328] + - [1, 91.146] + - - [1024, 2944, 1, 3328] + - [29, 79.42] + - - [256, 6784, 1, 1280] + - [8, 81.013] + - - [1856, 3584, 1, 256] + - [30, 81.716] + - - [6784, 1856, 1, 128] + - [29, 84.608] + - - [1024, 1500, 1, 2048] + - [7, 72.684] + - - [512, 24000, 1, 2816] + - [23, 91.105] + - - [256, 5888, 1, 1280] + - [37, 70.92] + - - [4288, 6784, 1, 3328] + - [1, 90.983] + - - [2368, 1408, 1, 128] + - [14, 72.77] + - - [1408, 1024, 1, 128] + - [0, 68.028] + - - [6784, 3584, 1, 3328] + - [36, 92.003] + - - [1760, 7000, 1, 1760] + - [15, 88.29] + - - [2368, 5056, 1, 1280] + - [36, 85.434] + - - [1408, 2368, 1, 1280] + - [36, 82.7] + - - [704, 4288, 1, 128] + - [6, 66.083] + - - [2944, 2944, 1, 256] + - [15, 87.23] + - - [6784, 256, 1, 256] + - [14, 76.559] + - - [256, 5056, 1, 3328] + - [22, 72.066] + - - [5056, 1856, 1, 128] + - [14, 81.441] + - - [1024, 3000, 1, 1536] + - [35, 80.16] + - - [5056, 1024, 1, 3328] + - [1, 88.75] + - - [4288, 3584, 1, 3328] + - [15, 89.991] + - - [1024, 2368, 1, 1280] + - [29, 76.406] + - - [5888, 6784, 1, 3328] + - [1, 93.699] + - - [704, 4288, 1, 1280] + - [7, 74.493] + - - [128, 50176, 1, 512] + - [7, 86.508] + - - [1024, 48000, 1, 2048] + - [27, 92.089] + - - [4288, 1024, 1, 128] + - [22, 74.231] + - - [784, 128, 128, 512] + - [39, 70.708] + - - [784, 512, 256, 128] + - [0, 71.375] + - - [3136, 256, 256, 64] + - [10, 42.212] + - - [784, 512, 128, 128] + - [14, 67.356] + - - [784, 128, 256, 512] + - [39, 71.592] + - - [3136, 256, 128, 64] + - [0, 46.859] + - - [4096, 512, 1, 1024] + - [7, 75.91] + - - [2048, 768, 1, 512] + - [16, 71.948] + - - [4096, 512, 1, 2048] + - [34, 78.648] + - - [4096, 1024, 1, 2048] + - [29, 81.342] + - - [2048, 1024, 1, 2048] + - [40, 78.581] + - - [2048, 1024, 1, 4096] + - [28, 79.352] + - - [4096, 1024, 1, 1024] + - [29, 80.746] + - - [2048, 1024, 1, 512] + - [21, 74.737] + - - [4096, 1024, 1, 4096] + - [22, 81.599] + - - [2048, 1024, 1, 1024] + - [28, 76.762] + - - [4096, 384, 1, 2048] + - [7, 74.759] + - - [1225, 192, 64, 384] + - [0, 78.743] + - - [289, 128, 64, 1024] + - [22, 60.827] + - - [4096, 384, 1, 1536] + - [1, 74.872] + - - [289, 192, 64, 1024] + - [38, 61.391] + - - [4096, 384, 1, 1280] + - [36, 74.516] + - - [4096, 448, 1, 1280] + - [14, 74.367] + - - [289, 256, 64, 1024] + - [11, 63.593] + - - [4096, 448, 1, 2048] + - [22, 74.452] + - - [289, 384, 64, 1024] + - [11, 64.766] + - - [1024, 3594, 1, 4096] + - [6, 81.973] + - - [4096, 3103, 1, 1024] + - [7, 86.359] + - - [4096, 3136, 1, 1024] + - [7, 87.424] + - - [1024, 3141, 1, 4096] + - [40, 81.766] + - - [4096, 3559, 1, 1024] + - [7, 90.595] + - - [4096, 3368, 1, 1024] + - [7, 89.395] + - - [1024, 3335, 1, 4096] + - [7, 85.984] + - - [1024, 3510, 1, 4096] + - [6, 80.187] + - - [4096, 3209, 1, 1024] + - [7, 85.542] + - - [4096, 3322, 1, 1024] + - [7, 88.376] + - - [1024, 3400, 1, 4096] + - [7, 87.667] + - - [1024, 3995, 1, 4096] + - [6, 84.798] + - - [1024, 3503, 1, 4096] + - [6, 79.997] + - - [4096, 3594, 1, 1024] + - [7, 88.294] + - - [4096, 3473, 1, 1024] + - [7, 88.362] + - - [4096, 3522, 1, 1024] + - [7, 90.009] + - - [1024, 3103, 1, 4096] + - [6, 83.277] + - - [1024, 3214, 1, 4096] + - [28, 83.607] + - - [4096, 3449, 1, 1024] + - [7, 91.525] + - - [1024, 3136, 1, 4096] + - [6, 84.207] + - - [1024, 3955, 1, 33708] + - [30, 87.888] + - - [1024, 3780, 1, 4096] + - [7, 84.351] + - - [1024, 3906, 1, 33708] + - [7, 86.923] + - - [1024, 3386, 1, 4096] + - [23, 87.085] + - - [4096, 3396, 1, 1024] + - [7, 89.81] + - - [1024, 3183, 1, 4096] + - [40, 82.677] + - - [1024, 3098, 1, 4096] + - [22, 83.16] + - - [1024, 3548, 1, 4096] + - [6, 80.913] + - - [1024, 3224, 1, 4096] + - [28, 83.886] + - - [4096, 3469, 1, 1024] + - [7, 88.461] + - - [1024, 3582, 1, 4096] + - [22, 81.87] + - - [1024, 2977, 1, 4096] + - [22, 80.015] + - - [1024, 3939, 1, 1024] + - [23, 85.601] + - - [4096, 3176, 1, 1024] + - [7, 88.29] + - - [1024, 3559, 1, 4096] + - [22, 81.256] + - - [1024, 3478, 1, 4096] + - [6, 79.424] + - - [4096, 3343, 1, 1024] + - [7, 88.682] + - - [4096, 3440, 1, 1024] + - [7, 91.087] + - - [1024, 3996, 1, 33708] + - [0, 84.947] + - - [1024, 4012, 1, 4096] + - [6, 85.132] + - - [1024, 3322, 1, 4096] + - [13, 86.007] + - - [1024, 3990, 1, 33708] + - [0, 84.82] + - - [1024, 3314, 1, 4096] + - [40, 85.651] + - - [4096, 3513, 1, 1024] + - [7, 89.395] + - - [1024, 3562, 1, 4096] + - [22, 81.364] + - - [1024, 3443, 1, 4096] + - [7, 88.691] + - - [1024, 3554, 1, 4096] + - [22, 81.301] + - - [1024, 3063, 1, 4096] + - [22, 82.244] + - - [4096, 3460, 1, 1024] + - [7, 88.186] + - - [1024, 3209, 1, 4096] + - [40, 83.377] + - - [1024, 3147, 1, 4096] + - [28, 82.014] + - - [4096, 3387, 1, 1024] + - [7, 89.91] + - - [4096, 3436, 1, 1024] + - [7, 91.168] + - - [1024, 3341, 1, 4096] + - [7, 86.093] + - - [1024, 3516, 1, 4096] + - [6, 80.349] + - - [4096, 3277, 1, 1024] + - [7, 87.234] + - - [1024, 3454, 1, 4096] + - [23, 88.894] + - - [1024, 3969, 1, 4096] + - [22, 84.311] + - - [1024, 3999, 1, 4096] + - [22, 84.866] + - - [1024, 4032, 1, 4096] + - [22, 85.664] + - - [4096, 3541, 1, 1024] + - [7, 90.117] + - - [4096, 3334, 1, 1024] + - [7, 88.416] + - - [1024, 3365, 1, 4096] + - [23, 86.724] + - - [1024, 3527, 1, 4096] + - [22, 80.417] + - - [1024, 3190, 1, 4096] + - [28, 83.025] + - - [4096, 3906, 1, 1024] + - [7, 89.161] + - - [1024, 3593, 1, 4096] + - [6, 81.874] + - - [1024, 3336, 1, 4096] + - [23, 85.966] + - - [4096, 3504, 1, 1024] + - [7, 89.323] + - - [4096, 3977, 1, 1024] + - [7, 88.118] + - - [1024, 3906, 1, 4096] + - [23, 87.153] + - - [4096, 3415, 1, 1024] + - [7, 90.528] + - - [1024, 3295, 1, 4096] + - [40, 85.461] + - - [4096, 3321, 1, 1024] + - [7, 88.7] + - - [1024, 3072, 1, 4096] + - [290, 83.544] + - - [1024, 3408, 1, 4096] + - [23, 87.794] + - - [1024, 3522, 1, 4096] + - [22, 80.426] + - - [4096, 3751, 1, 1024] + - [7, 88.976] + - - [4096, 3378, 1, 1024] + - [7, 89.616] + - - [1024, 3925, 1, 33708] + - [7, 87.351] + - - [1024, 3990, 1, 1024] + - [35, 83.643] + - - [1024, 3290, 1, 4096] + - [28, 85.204] + - - [4096, 3500, 1, 1024] + - [7, 89.17] + - - [4096, 3565, 1, 1024] + - [7, 90.907] + - - [1024, 3484, 1, 4096] + - [22, 79.7] + - - [4096, 3395, 1, 1024] + - [7, 90.027] + - - [1024, 3681, 1, 1024] + - [22, 82.777] + - - [1024, 3584, 1, 1024] + - [22, 80.981] + - - [4096, 3093, 1, 1024] + - [7, 85.953] + - - [1024, 4050, 1, 1024] + - [22, 79.876] + - - [1024, 3301, 1, 4096] + - [40, 85.664] + - - [1024, 3581, 1, 4096] + - [6, 81.789] + - - [4096, 3374, 1, 1024] + - [7, 89.553] + - - [1024, 3449, 1, 4096] + - [7, 88.719] + - - [4096, 3215, 1, 1024] + - [7, 85.533] + - - [4096, 3312, 1, 1024] + - [7, 88.515] + - - [4096, 3479, 1, 1024] + - [7, 88.8] + - - [4096, 3544, 1, 1024] + - [7, 90.474] + - - [1024, 3263, 1, 4096] + - [40, 84.595] + - - [4096, 3455, 1, 1024] + - [7, 91.615] + - - [1024, 3379, 1, 4096] + - [23, 87.031] + - - [1024, 3490, 1, 4096] + - [6, 79.812] + - - [1024, 3368, 1, 4096] + - [23, 86.932] + - - [4096, 3186, 1, 1024] + - [7, 88.691] + - - [1024, 3428, 1, 4096] + - [23, 88.461] + - - [4096, 3561, 1, 1024] + - [7, 90.816] + - - [4096, 3418, 1, 1024] + - [7, 90.636] + - - [1024, 3064, 1, 4096] + - [22, 82.316] + - - [4096, 3259, 1, 1024] + - [7, 87.067] + - - [4096, 3308, 1, 1024] + - [7, 88.019] + - - [1024, 3533, 1, 4096] + - [22, 80.71] + - - [1024, 3344, 1, 4096] + - [23, 86.553] + - - [1024, 4030, 1, 1024] + - [22, 84.369] + - - [4096, 3459, 1, 1024] + - [7, 88.227] + - - [1024, 3572, 1, 4096] + - [6, 81.531] + - - [1024, 3925, 1, 1024] + - [7, 85.389] + - - [4096, 3435, 1, 1024] + - [7, 91.128] + - - [1024, 3956, 1, 4096] + - [23, 88.227] + - - [1024, 3463, 1, 4096] + - [22, 79.154] + - - [4096, 3182, 1, 1024] + - [7, 88.34] + - - [4096, 3976, 1, 1024] + - [7, 88.137] + - - [1024, 3417, 1, 4096] + - [7, 88.024] + - - [1024, 3528, 1, 4096] + - [22, 80.57] + - - [4096, 3446, 1, 1024] + - [7, 91.489] + - - [1024, 3543, 1, 4096] + - [6, 80.891] + - - [4096, 3287, 1, 1024] + - [7, 87.505] + - - [1024, 3499, 1, 4096] + - [22, 79.93] + - - [1024, 3231, 1, 4096] + - [13, 84.031] + - - [4096, 3519, 1, 1024] + - [7, 89.729] + - - [4096, 3552, 1, 1024] + - [7, 90.361] + - - [1024, 3458, 1, 4096] + - [22, 79.054] + - - [1024, 3374, 1, 4096] + - [23, 87.099] + - - [1024, 3396, 1, 4096] + - [23, 87.631] + - - [1024, 2967, 1, 4096] + - [22, 79.88] + - - [4096, 3482, 1, 1024] + - [7, 88.719] + - - [1024, 3226, 1, 4096] + - [13, 83.571] + - - [4096, 3377, 1, 1024] + - [7, 89.436] + - - [4096, 3426, 1, 1024] + - [7, 90.785] + - - [4096, 2935, 1, 1024] + - [7, 88.881] + - - [1024, 3439, 1, 4096] + - [7, 88.525] + - - [4096, 3267, 1, 1024] + - [7, 86.954] + - - [4096, 3499, 1, 1024] + - [7, 89.03] + - - [4096, 3356, 1, 1024] + - [7, 88.872] + - - [4096, 3939, 1, 1024] + - [7, 90.018] + - - [1024, 3526, 1, 4096] + - [22, 80.561] + - - [1024, 3859, 1, 33708] + - [1, 85.894] + - - [1024, 3385, 1, 4096] + - [7, 87.171] + - - [1024, 3496, 1, 4096] + - [22, 79.934] + - - [4096, 3141, 1, 1024] + - [7, 87.342] + - - [4096, 3510, 1, 1024] + - [7, 89.643] + - - [1024, 3434, 1, 4096] + - [7, 88.443] + - - [4096, 3969, 1, 1024] + - [7, 87.893] + - - [1024, 3121, 1, 4096] + - [22, 83.706] + - - [1024, 3232, 1, 4096] + - [40, 84.004] + - - [1024, 4030, 1, 33708] + - [0, 85.619] + - - [1024, 3780, 1, 33708] + - [1, 84.189] + - - [1024, 3969, 1, 1024] + - [35, 83.025] + - - [4096, 3527, 1, 1024] + - [7, 89.878] + - - [4096, 3336, 1, 1024] + - [7, 88.34] + - - [4096, 3290, 1, 1024] + - [7, 87.577] + - - [1024, 3469, 1, 4096] + - [6, 79.393] + - - [4096, 3490, 1, 1024] + - [7, 89.152] + - - [4096, 3064, 1, 1024] + - [7, 88.84] + - - [4096, 3582, 1, 1024] + - [7, 91.394] + - - [1024, 3956, 1, 1024] + - [23, 85.551] + - - [4096, 3417, 1, 1024] + - [7, 90.708] + - - [1024, 2736, 1, 4096] + - [23, 83.783] + - - [1024, 3205, 1, 4096] + - [28, 83.323] + - - [1024, 3143, 1, 4096] + - [28, 81.91] + - - [1024, 4020, 1, 4096] + - [6, 85.285] + - - [1024, 3318, 1, 4096] + - [28, 86.034] + - - [4096, 3364, 1, 1024] + - [7, 89.287] + - - [1024, 3353, 1, 4096] + - [7, 86.557] + - - [1024, 3464, 1, 4096] + - [6, 79.136] + - - [4096, 3205, 1, 1024] + - [7, 85.411] + - - [4096, 3318, 1, 1024] + - [7, 88.358] + - - [1024, 3402, 1, 4096] + - [23, 87.798] + - - [4096, 3181, 1, 1024] + - [7, 88.434] + - - [4096, 3550, 1, 1024] + - [7, 90.659] + - - [4096, 3445, 1, 1024] + - [7, 91.398] + - - [1024, 3138, 1, 4096] + - [40, 81.653] + - - [4096, 3079, 1, 1024] + - [7, 85.614] + - - [4096, 3144, 1, 1024] + - [7, 87.545] + - - [4096, 3860, 1, 1024] + - [7, 88.245] + - - [1024, 3515, 1, 4096] + - [6, 80.417] + - - [4096, 3408, 1, 1024] + - [7, 90.388] + - - [1024, 3181, 1, 4096] + - [28, 82.655] + - - [4096, 3298, 1, 1024] + - [7, 87.866] + - - [4096, 3585, 1, 1024] + - [7, 88.141] + - - [1024, 3550, 1, 4096] + - [22, 81.112] + - - [1024, 4020, 1, 1024] + - [35, 84.189] + - - [4096, 3481, 1, 1024] + - [7, 88.809] + - - [4096, 3530, 1, 1024] + - [7, 89.698] + - - [4096, 3425, 1, 1024] + - [7, 90.771] + - - [4096, 4026, 1, 1024] + - [7, 89.084] + - - [1024, 3860, 1, 1024] + - [23, 83.968] + - - [4096, 3975, 1, 1024] + - [7, 88.078] + - - [1024, 3286, 1, 4096] + - [40, 85.29] + - - [1024, 3176, 1, 4096] + - [28, 82.759] + - - [1024, 3894, 1, 4096] + - [23, 87.103] + - - [4096, 3355, 1, 1024] + - [7, 89.084] + - - [4096, 3404, 1, 1024] + - [7, 90.442] + - - [1024, 3501, 1, 4096] + - [22, 80.088] + - - [4096, 3245, 1, 1024] + - [7, 86.463] + - - [1024, 3431, 1, 4096] + - [23, 88.448] + - - [1024, 4000, 1, 1024] + - [35, 83.882] + - - [4096, 3509, 1, 1024] + - [7, 89.476] + - - [4096, 3558, 1, 1024] + - [7, 90.875] + - - [1024, 3535, 1, 4096] + - [22, 80.742] + - - [1024, 3414, 1, 4096] + - [23, 88.168] + - - [1024, 3445, 1, 4096] + - [7, 88.746] + - - [1024, 3436, 1, 4096] + - [7, 88.529] + - - [4096, 3472, 1, 1024] + - [7, 88.583] + - - [1024, 3211, 1, 4096] + - [40, 83.48] + - - [4096, 3383, 1, 1024] + - [7, 89.72] + - - [4096, 3448, 1, 1024] + - [7, 91.534] + - - [1024, 3343, 1, 4096] + - [23, 86.224] + - - [1024, 3518, 1, 4096] + - [6, 80.453] + - - [4096, 3289, 1, 1024] + - [7, 87.613] + - - [1024, 3440, 1, 4096] + - [7, 88.588] + - - [1024, 4032, 1, 33708] + - [14, 85.484] + - - [4096, 3489, 1, 1024] + - [7, 88.867] + - - [4096, 3346, 1, 1024] + - [7, 88.515] + - - [1024, 3534, 1, 4096] + - [6, 80.733] + - - [1024, 3079, 1, 4096] + - [22, 82.709] + - - [1024, 3955, 1, 4096] + - [23, 88.294] + - - [4096, 3236, 1, 1024] + - [7, 86.535] + - - [1024, 3545, 1, 4096] + - [22, 81.04] + - - [1024, 3144, 1, 4096] + - [28, 81.973] + - - [4096, 3780, 1, 1024] + - [7, 89.612] + - - [4096, 3163, 1, 1024] + - [7, 87.961] + - - [4096, 3468, 1, 1024] + - [7, 88.488] + - - [1024, 3539, 1, 4096] + - [22, 80.828] + - - [1024, 3541, 1, 4096] + - [22, 80.873] + - - [4096, 3363, 1, 1024] + - [7, 89.206] + - - [1024, 3475, 1, 4096] + - [35, 79.51] + - - [4096, 3110, 1, 1024] + - [7, 86.67] + - - [1024, 3509, 1, 4096] + - [22, 80.259] + - - [1024, 3413, 1, 4096] + - [23, 88.078] + - - [1024, 3975, 1, 1024] + - [22, 83.291] + - - [4096, 3549, 1, 1024] + - [7, 90.505] + - - [4096, 3342, 1, 1024] + - [7, 88.525] + - - [1024, 2985, 1, 4096] + - [35, 80.273] + - - [1024, 3876, 1, 33708] + - [7, 86.318] + - - [4096, 3280, 1, 1024] + - [7, 87.32] + - - [4096, 3191, 1, 1024] + - [7, 88.579] + - - [4096, 3512, 1, 1024] + - [7, 89.652] + - - [1024, 3560, 1, 4096] + - [22, 81.306] + - - [4096, 2499, 1, 1024] + - [7, 88.042] + - - [1024, 3248, 1, 4096] + - [13, 84.374] + - - [4096, 3423, 1, 1024] + - [7, 90.758] + - - [4096, 3297, 1, 1024] + - [7, 87.69] + - - [4096, 3154, 1, 1024] + - [7, 87.857] + - - [1024, 3303, 1, 4096] + - [40, 85.551] + - - [1024, 3222, 1, 4096] + - [28, 83.72] + - - [1024, 3978, 1, 1024] + - [35, 83.458] + - - [4096, 3529, 1, 1024] + - [7, 89.946] + - - [4096, 3386, 1, 1024] + - [7, 89.869] + - - [1024, 3451, 1, 4096] + - [23, 88.822] + - - [4096, 3562, 1, 1024] + - [7, 90.776] + - - [4096, 3276, 1, 1024] + - [7, 87.342] + - - [1024, 3894, 1, 33708] + - [7, 86.621] + - - [4096, 3540, 1, 1024] + - [7, 90.077] + - - [1024, 3416, 1, 4096] + - [23, 88.001] + - - [1024, 4005, 1, 33708] + - [14, 85.082] + - - [1024, 3942, 1, 4096] + - [7, 87.721] + - - [4096, 3403, 1, 1024] + - [7, 90.108] + - - [4096, 3381, 1, 1024] + - [7, 89.612] + - - [1024, 3492, 1, 4096] + - [22, 79.844] + - - [4096, 3101, 1, 1024] + - [7, 86.336] + - - [1024, 3430, 1, 4096] + - [23, 88.38] + - - [1024, 3977, 1, 4096] + - [22, 84.482] + - - [1024, 3640, 1, 4096] + - [22, 83.056] + - - [4096, 3557, 1, 1024] + - [7, 90.686] + - - [4096, 3414, 1, 1024] + - [7, 90.392] + - - [1024, 3391, 1, 4096] + - [23, 87.536] + - - [1024, 3356, 1, 4096] + - [7, 86.594] + - - [4096, 3320, 1, 1024] + - [1, 88.362] + - - [4096, 2765, 1, 1024] + - [7, 87.911] + - - [1024, 3411, 1, 4096] + - [23, 87.952] + - - [1024, 3978, 1, 4096] + - [22, 84.514] + - - [4096, 3487, 1, 1024] + - [7, 88.773] + - - [4096, 3520, 1, 1024] + - [7, 89.797] + - - [4096, 3942, 1, 1024] + - [7, 89.914] + - - [4096, 3431, 1, 1024] + - [7, 91.042] + - - [1024, 3271, 1, 4096] + - [40, 84.875] + - - [4096, 4020, 1, 1024] + - [7, 89.048] + - - [1024, 3481, 1, 4096] + - [6, 79.506] + - - [1024, 3419, 1, 4096] + - [23, 88.209] + - - [1024, 4059, 1, 4096] + - [22, 80.846] + - - [4096, 3345, 1, 1024] + - [7, 88.696] + - - [4096, 3394, 1, 1024] + - [7, 90.027] + - - [1024, 3298, 1, 4096] + - [40, 85.47] + - - [4096, 3235, 1, 1024] + - [7, 86.237] + - - [1024, 3681, 1, 33708] + - [14, 84.022] + - - [1024, 3362, 1, 4096] + - [7, 86.508] + - - [4096, 3467, 1, 1024] + - [7, 88.353] + - - [1024, 3349, 1, 4096] + - [23, 86.26] + - - [1024, 3460, 1, 4096] + - [6, 79.158] + - - [4096, 3214, 1, 1024] + - [7, 85.308] + - - [1024, 3398, 1, 4096] + - [23, 87.559] + - - [4096, 3478, 1, 1024] + - [7, 88.281] + - - [1024, 4050, 1, 33708] + - [14, 80.868] + - - [1024, 3244, 1, 4096] + - [40, 83.945] + - - [4096, 3341, 1, 1024] + - [7, 88.682] + - - [4096, 3454, 1, 1024] + - [7, 91.38] + - - [1024, 3166, 1, 4096] + - [40, 82.334] + - - [1024, 3425, 1, 4096] + - [23, 88.547] + - - [4096, 3295, 1, 1024] + - [7, 87.721] + - - [4096, 3072, 1, 1024] + - [262, 89.375] + - - [4096, 3822, 1, 1024] + - [7, 90.744] + - - [1024, 3681, 1, 4096] + - [6, 83.751] + - - [1024, 4050, 1, 4096] + - [22, 80.62] + - - [4096, 3495, 1, 1024] + - [7, 89.057] + - - [4096, 3560, 1, 1024] + - [7, 90.668] + - - [1024, 3524, 1, 4096] + - [22, 80.467] + - - [1024, 3942, 1, 33708] + - [7, 87.613] + - - [1024, 3304, 1, 4096] + - [40, 85.19] + - - [1024, 3387, 1, 4096] + - [23, 87.207] + - - [1024, 3498, 1, 4096] + - [22, 79.785] + - - [4096, 3458, 1, 1024] + - [7, 88.155] + - - [4096, 2967, 1, 1024] + - [7, 86.052] + - - [4096, 3385, 1, 1024] + - [7, 89.589] + - - [4096, 3434, 1, 1024] + - [7, 91.06] + - - [1024, 3519, 1, 4096] + - [22, 80.363] + - - [1024, 3511, 1, 4096] + - [6, 80.295] + - - [1024, 3288, 1, 4096] + - [28, 85.42] + - - [1024, 2918, 1, 4096] + - [6, 78.576] + - - [4096, 3573, 1, 1024] + - [7, 90.929] + - - [1024, 3822, 1, 33708] + - [1, 85.244] + - - [4096, 3539, 1, 1024] + - [7, 90.095] + - - [4096, 3332, 1, 1024] + - [7, 88.24] + - - [4096, 3286, 1, 1024] + - [7, 87.329] + - - [1024, 4026, 1, 4096] + - [6, 85.38] + - - [1024, 3277, 1, 4096] + - [28, 85.073] + - - [1024, 3471, 1, 4096] + - [6, 79.415] + - - [4096, 3518, 1, 1024] + - [7, 89.576] + - - [1024, 3393, 1, 4096] + - [23, 87.338] + - - [4096, 3413, 1, 1024] + - [7, 90.374] + - - [4096, 3303, 1, 1024] + - [7, 87.834] + - - [1024, 3207, 1, 4096] + - [40, 83.332] + - - [1024, 3894, 1, 1024] + - [23, 84.635] + - - [1024, 3977, 1, 1024] + - [35, 83.494] + - - [4096, 3535, 1, 1024] + - [7, 90.095] + - - [4096, 3376, 1, 1024] + - [7, 89.58] + - - [1024, 3355, 1, 4096] + - [23, 86.535] + - - [1024, 3466, 1, 4096] + - [22, 79.248] + - - [4096, 3266, 1, 1024] + - [7, 86.747] + - - [1024, 3404, 1, 4096] + - [7, 87.69] + - - [1024, 3999, 1, 1024] + - [35, 83.895] + - - [4096, 3498, 1, 1024] + - [7, 89.224] + - - [1024, 4032, 1, 1024] + - [35, 84.613] + - - [1024, 3410, 1, 4096] + - [23, 87.875] + - - [4096, 3393, 1, 1024] + - [7, 89.968] + - - [1024, 3140, 1, 4096] + - [40, 81.658] + - - [1024, 3910, 1, 33708] + - [7, 87.072] + - - [1024, 3334, 1, 4096] + - [7, 85.781] + - - [4096, 3140, 1, 1024] + - [7, 87.225] + - - [1024, 4005, 1, 4096] + - [6, 84.978] + - - [1024, 3579, 1, 4096] + - [22, 81.671] + - - [4096, 3372, 1, 1024] + - [7, 89.278] + - - [1024, 3245, 1, 4096] + - [40, 84.414] + - - [4096, 3956, 1, 1024] + - [7, 90.126] + - - [4096, 3213, 1, 1024] + - [7, 85.542] + - - [1024, 3361, 1, 4096] + - [23, 86.711] + - - [1024, 3536, 1, 4096] + - [22, 80.805] + - - [4096, 3477, 1, 1024] + - [7, 88.583] + - - [4096, 3526, 1, 1024] + - [7, 89.941] + - - [1024, 4005, 1, 1024] + - [35, 83.986] + - - [1024, 3530, 1, 4096] + - [6, 80.643] + - - [1024, 3944, 1, 4096] + - [7, 88.019] + - - [4096, 3453, 1, 1024] + - [7, 91.268] + - - [4096, 3184, 1, 1024] + - [7, 88.457] + - - [4096, 3579, 1, 1024] + - [7, 91.168] + - - [4096, 3351, 1, 1024] + - [7, 88.84] + - - [4096, 3416, 1, 1024] + - [7, 90.695] + - - [1024, 3822, 1, 4096] + - [23, 85.393] + - - [1024, 3796, 1, 4096] + - [7, 84.64] + - - [4096, 3257, 1, 1024] + - [7, 86.639] + - - [4096, 3306, 1, 1024] + - [7, 87.888] + - - [1024, 3505, 1, 4096] + - [22, 80.115] + - - [1024, 3315, 1, 4096] + - [13, 85.998] + - - [1024, 3486, 1, 4096] + - [6, 79.524] + - - [4096, 3457, 1, 1024] + - [7, 88.416] + - - [4096, 3870, 1, 1024] + - [7, 88.398] + - - [1024, 3447, 1, 4096] + - [7, 88.7] + - - [1024, 3558, 1, 4096] + - [22, 81.261] + - - [4096, 3433, 1, 1024] + - [7, 90.866] + - - [4096, 3180, 1, 1024] + - [7, 88.227] + - - [1024, 3213, 1, 4096] + - [40, 83.332] + - - [1024, 3900, 1, 4096] + - [7, 87.058] + - - [4096, 3444, 1, 1024] + - [7, 91.146] + - - [1024, 3504, 1, 4096] + - [22, 80.074] + - - [4096, 4059, 1, 1024] + - [7, 89.517] + - - [1024, 3442, 1, 4096] + - [7, 88.398] + - - [4096, 3517, 1, 1024] + - [7, 89.305] + - - [1024, 3566, 1, 4096] + - [6, 81.319] + - - [4096, 3248, 1, 1024] + - [7, 86.612] + - - [1024, 3547, 1, 4096] + - [22, 80.9] + - - [1024, 3340, 1, 4096] + - [23, 86.242] + - - [4096, 3480, 1, 1024] + - [7, 88.795] + - - [4096, 3424, 1, 1024] + - [7, 90.866] + - - [1024, 3906, 1, 1024] + - [23, 84.888] + - - [4096, 3265, 1, 1024] + - [7, 87.045] + - - [1024, 3384, 1, 4096] + - [7, 87.306] + - - [1024, 3494, 1, 4096] + - [22, 79.817] + - - [1024, 3236, 1, 4096] + - [13, 84.157] + - - [4096, 3497, 1, 1024] + - [7, 89.174] + - - [4096, 3354, 1, 1024] + - [7, 88.89] + - - [4096, 3055, 1, 1024] + - [7, 88.371] + - - [4096, 3244, 1, 1024] + - [7, 86.332] + - - [4096, 3139, 1, 1024] + - [7, 87.424] + - - [4096, 3508, 1, 1024] + - [7, 89.49] + - - [4096, 4050, 1, 1024] + - [7, 89.67] + - - [1024, 3472, 1, 4096] + - [22, 79.402] + - - [1024, 3861, 1, 1024] + - [23, 83.828] + - - [1024, 3910, 1, 1024] + - [23, 84.906] + - - [4096, 3371, 1, 1024] + - [7, 89.594] + - - [1024, 3751, 1, 4096] + - [7, 83.814] + - - [4096, 3325, 1, 1024] + - [7, 88.389] + - - [1024, 3321, 1, 4096] + - [40, 85.885] + - - [1024, 3944, 1, 1024] + - [7, 85.488] + - - [4096, 3525, 1, 1024] + - [7, 89.792] + - - [4096, 3382, 1, 1024] + - [7, 89.549] + - - [1024, 3453, 1, 4096] + - [7, 88.845] + - - [4096, 3564, 1, 1024] + - [7, 90.668] + - - [4096, 3288, 1, 1024] + - [7, 87.649] + - - [1024, 3925, 1, 4096] + - [23, 87.293] + - - [1024, 3057, 1, 4096] + - [22, 82.109] + - - [4096, 3488, 1, 1024] + - [7, 88.759] + - - [4096, 3046, 1, 1024] + - [7, 88.254] + - - [1024, 3189, 1, 4096] + - [40, 82.871] + - - [4096, 3399, 1, 1024] + - [7, 89.973] + - - [1024, 3383, 1, 4096] + - [23, 87.248] + - - [1024, 3415, 1, 4096] + - [7, 87.965] + - - [1024, 3388, 1, 4096] + - [23, 87.496] + - - [1024, 3376, 1, 4096] + - [23, 87.284] + - - [1024, 3473, 1, 4096] + - [6, 79.424] + - - [4096, 3162, 1, 1024] + - [7, 88.006] + - - [1024, 3448, 1, 4096] + - [23, 88.719] + - - [4096, 3362, 1, 1024] + - [7, 89.197] + - - [1024, 3262, 1, 4096] + - [40, 84.847] + - - [1024, 3184, 1, 4096] + - [13, 82.844] + - - [1024, 3378, 1, 4096] + - [23, 87.311] + - - [4096, 3548, 1, 1024] + - [7, 90.532] + - - [4096, 2977, 1, 1024] + - [7, 86.359] + - - [4096, 3443, 1, 1024] + - [7, 91.069] + - - [1024, 3289, 1, 4096] + - [28, 85.159] + - - [1024, 3483, 1, 4096] + - [22, 79.609] + - - [4096, 3190, 1, 1024] + - [7, 88.759] + - - [1024, 3421, 1, 4096] + - [23, 88.2] + - - [1024, 3514, 1, 4096] + - [22, 80.295] + - - [1024, 3532, 1, 4096] + - [22, 80.656] + - - [1024, 3565, 1, 4096] + - [22, 81.459] + - - [4096, 3422, 1, 1024] + - [7, 90.731] + - - [4096, 3263, 1, 1024] + - [7, 86.968] + - - [4096, 3296, 1, 1024] + - [7, 87.527] + - - [4096, 3640, 1, 1024] + - [7, 89.526] + - - [4096, 3463, 1, 1024] + - [7, 88.412] + - - [4096, 3528, 1, 1024] + - [7, 89.977] + - - [1024, 3351, 1, 4096] + - [7, 86.278] + - - [1024, 3462, 1, 4096] + - [22, 79.19] + - - [4096, 3226, 1, 1024] + - [7, 86.129] + - - [4096, 3439, 1, 1024] + - [7, 91.024] + - - [4096, 3121, 1, 1024] + - [7, 86.941] + - - [1024, 4059, 1, 33708] + - [14, 81.13] + - - [1024, 3311, 1, 4096] + - [40, 85.42] + - - [1024, 3230, 1, 4096] + - [28, 83.932] + - - [4096, 3353, 1, 1024] + - [7, 88.989] + - - [4096, 3402, 1, 1024] + - [7, 90.32] + - - [1024, 3427, 1, 4096] + - [36, 88.344] + - - [1024, 3346, 1, 4096] + - [23, 86.544] + - - [1024, 3126, 1, 4096] + - [22, 83.855] + - - [1024, 3796, 1, 1024] + - [23, 82.515] + - - [1024, 3990, 1, 4096] + - [22, 84.649] + - - [1024, 3257, 1, 4096] + - [13, 84.676] + - - [4096, 3996, 1, 1024] + - [7, 88.47] + - - [1024, 3306, 1, 4096] + - [40, 85.836] + - - [1024, 3389, 1, 4096] + - [23, 87.609] + - - [1024, 3500, 1, 4096] + - [6, 80.015] + - - [1024, 3999, 1, 33708] + - [14, 84.933] + - - [4096, 3486, 1, 1024] + - [7, 88.881] + - - [1024, 3438, 1, 4096] + - [7, 88.326] + - - [4096, 3616, 1, 1024] + - [7, 88.664] + - - [1024, 3955, 1, 1024] + - [7, 85.213] + - - [4096, 3430, 1, 1024] + - [7, 90.938] + - - [4096, 3271, 1, 1024] + - [7, 86.918] + - - [1024, 3364, 1, 4096] + - [23, 86.756] + - - [1024, 3497, 1, 4096] + - [22, 79.925] + - - [4096, 3503, 1, 1024] + - [7, 89.377] + - - [4096, 3344, 1, 1024] + - [7, 88.484] + - - [1024, 3457, 1, 4096] + - [22, 78.933] + - - [4096, 3466, 1, 1024] + - [7, 88.249] + - - [1024, 3976, 1, 33708] + - [22, 84.351] + - - [1024, 3395, 1, 4096] + - [23, 87.433] + - - [4096, 3361, 1, 1024] + - [7, 88.894] + - - [1024, 3751, 1, 33708] + - [7, 83.521] + - - [1024, 3822, 1, 1024] + - [36, 82.064] + - - [4096, 3315, 1, 1024] + - [7, 88.502] + - - [1024, 3163, 1, 4096] + - [40, 81.888] + - - [4096, 3547, 1, 1024] + - [7, 90.108] + - - [4096, 3340, 1, 1024] + - [7, 88.515] + - - [1024, 3296, 1, 4096] + - [40, 85.226] + - - [1024, 3468, 1, 4096] + - [6, 79.113] + - - [4096, 3294, 1, 1024] + - [7, 87.536] + - - [1024, 3406, 1, 4096] + - [23, 87.627] + - - [1024, 3860, 1, 33708] + - [1, 85.912] + - - [1024, 3584, 1, 4096] + - [22, 81.838] + - - [4096, 3189, 1, 1024] + - [7, 88.308] + - - [4096, 3494, 1, 1024] + - [7, 89.043] + - - [1024, 3093, 1, 4096] + - [22, 82.831] + - - [4096, 3421, 1, 1024] + - [7, 90.428] + - - [1024, 3479, 1, 4096] + - [6, 79.33] + - - [1024, 3433, 1, 4096] + - [7, 88.497] + - - [4096, 3311, 1, 1024] + - [7, 87.735] + - - [1024, 3381, 1, 4096] + - [7, 87.031] + - - [1024, 3996, 1, 4096] + - [35, 84.789] + - - [4096, 3384, 1, 1024] + - [7, 89.698] + - - [1024, 3247, 1, 4096] + - [40, 84.256] + - - [1024, 3169, 1, 4096] + - [28, 82.402] + - - [1024, 3088, 1, 4096] + - [22, 82.939] + - - [1024, 3363, 1, 4096] + - [23, 86.724] + - - [1024, 3538, 1, 4096] + - [6, 80.855] + - - [1024, 3996, 1, 1024] + - [22, 83.909] + - - [4096, 3169, 1, 1024] + - [7, 88.227] + - - [4096, 3538, 1, 1024] + - [7, 90.243] + - - [4096, 3401, 1, 1024] + - [7, 90.049] + - - [4096, 3581, 1, 1024] + - [7, 91.173] + - - [1024, 3180, 1, 4096] + - [40, 82.492] + - - [1024, 3870, 1, 1024] + - [23, 83.877] + - - [4096, 3555, 1, 1024] + - [7, 90.419] + - - [4096, 3412, 1, 1024] + - [7, 90.392] + - - [4096, 3302, 1, 1024] + - [7, 87.893] + - - [1024, 3561, 1, 4096] + - [6, 81.328] + - - [1024, 3302, 1, 4096] + - [28, 85.808] + - - [1024, 3976, 1, 4096] + - [6, 84.378] + - - [4096, 3485, 1, 1024] + - [7, 88.849] + - - [4096, 3534, 1, 1024] + - [7, 89.986] + - - [1024, 3110, 1, 4096] + - [22, 83.494] + - - [1024, 3401, 1, 4096] + - [23, 87.852] + - - [4096, 3216, 1, 1024] + - [7, 85.524] + - - [1024, 4020, 1, 33708] + - [14, 85.375] + - - [1024, 3215, 1, 4096] + - [40, 83.228] + - - [4096, 3566, 1, 1024] + - [7, 90.69] + - - [1024, 3137, 1, 4096] + - [40, 81.653] + - - [4096, 3359, 1, 1024] + - [7, 88.967] + - - [4096, 3392, 1, 1024] + - [7, 90.036] + - - [1024, 3506, 1, 4096] + - [35, 80.151] + - - [4096, 3233, 1, 1024] + - [7, 86.057] + - - [1024, 3444, 1, 4096] + - [36, 88.687] + - - [1024, 3975, 1, 4096] + - [6, 84.405] + - - [1024, 3870, 1, 33708] + - [1, 85.993] + - - [4096, 3465, 1, 1024] + - [7, 88.367] + - - [1024, 3523, 1, 4096] + - [22, 80.557] + - - [4096, 3990, 1, 1024] + - [7, 88.29] + - - [1024, 3549, 1, 4096] + - [22, 81.053] + - - [1024, 3342, 1, 4096] + - [23, 86.242] + - - [4096, 3476, 1, 1024] + - [7, 88.723] + - - [1024, 3418, 1, 4096] + - [23, 88.091] + - - [1024, 3859, 1, 1024] + - [7, 83.914] + - - [4096, 3339, 1, 1024] + - [7, 88.529] + - - [4096, 3452, 1, 1024] + - [7, 91.565] + - - [4096, 3293, 1, 1024] + - [7, 87.735] + - - [1024, 3369, 1, 4096] + - [7, 86.959] + - - [1024, 3544, 1, 4096] + - [22, 80.859] + - - [4096, 3493, 1, 1024] + - [7, 89.107] + - - [4096, 3350, 1, 1024] + - [7, 88.935] + - - [4096, 3256, 1, 1024] + - [7, 86.756] + - - [1024, 3870, 1, 4096] + - [23, 86.323] + - - [4096, 4012, 1, 1024] + - [7, 88.732] + - - [1024, 3280, 1, 4096] + - [40, 84.798] + - - [4096, 3456, 1, 1024] + - [7, 91.755] + - - [1024, 3555, 1, 4096] + - [6, 81.17] + - - [4096, 3014, 1, 1024] + - [7, 87.162] + - - [1024, 3474, 1, 4096] + - [22, 79.474] + - - [4096, 3367, 1, 1024] + - [7, 89.21] + - - [4096, 3432, 1, 1024] + - [7, 91.096] + - - [4096, 3273, 1, 1024] + - [7, 87.185] + - - [4096, 3130, 1, 1024] + - [7, 87.153] + - - [1024, 2984, 1, 4096] + - [35, 80.304] + - - [1024, 3995, 1, 1024] + - [22, 83.733] + - - [1024, 3517, 1, 4096] + - [35, 80.471] + - - [1024, 3455, 1, 4096] + - [7, 88.962] + - - [1024, 3939, 1, 4096] + - [23, 87.965] + - - [4096, 3147, 1, 1024] + - [7, 87.591] + - - [4096, 3516, 1, 1024] + - [7, 89.689] + - - [1024, 3876, 1, 4096] + - [7, 86.395] + - - [1024, 3191, 1, 4096] + - [28, 83.038] + - - [4096, 3411, 1, 1024] + - [7, 90.555] + - - [1024, 3337, 1, 4096] + - [7, 85.944] + - - [1024, 3512, 1, 4096] + - [6, 80.187] + - - [4096, 3301, 1, 1024] + - [7, 87.816] + - - [1024, 3450, 1, 4096] + - [23, 88.746] + - - [4096, 3533, 1, 1024] + - [7, 90.059] + - - [4096, 3390, 1, 1024] + - [7, 89.824] + - - [4096, 3231, 1, 1024] + - [7, 86.224] + - - [1024, 2499, 1, 4096] + - [22, 80.909] + - - [1024, 3186, 1, 4096] + - [28, 82.844] + - - [1024, 3380, 1, 4096] + - [23, 87.225] + - - [4096, 3496, 1, 1024] + - [7, 89.296] + - - [1024, 3956, 1, 33708] + - [7, 88.006] + - - [1024, 3976, 1, 1024] + - [35, 83.246] + - - [4096, 2736, 1, 1024] + - [7, 86.954] + - - [1024, 3291, 1, 4096] + - [40, 85.226] + - - [1024, 3944, 1, 33708] + - [7, 87.627] + - - [1024, 3485, 1, 4096] + - [35, 79.663] + - - [4096, 3138, 1, 1024] + - [7, 87.266] + - - [1024, 3423, 1, 4096] + - [23, 88.213] + - - [1024, 3491, 1, 4096] + - [22, 79.826] + - - [1024, 3860, 1, 4096] + - [7, 86.237] + - - [4096, 3211, 1, 1024] + - [7, 85.434] + - - [1024, 3221, 1, 4096] + - [40, 83.729] + - - [1024, 2917, 1, 4096] + - [6, 78.549] + - - [4096, 3475, 1, 1024] + - [7, 88.786] + - - [4096, 3524, 1, 1024] + - [7, 89.91] + - - [4096, 2985, 1, 1024] + - [7, 86.544] + - - [1024, 3480, 1, 4096] + - [22, 79.673] + - - [4096, 3222, 1, 1024] + - [7, 85.98] + - - [4096, 3451, 1, 1024] + - [7, 91.471] + - - [1024, 3969, 1, 33708] + - [14, 84.423] + - - [1024, 3640, 1, 1024] + - [35, 81.527] + - - [1024, 3297, 1, 4096] + - [28, 85.393] + - - [4096, 3944, 1, 1024] + - [7, 90.099] + - - [1024, 3216, 1, 4096] + - [40, 83.517] + - - [4096, 3349, 1, 1024] + - [7, 88.728] + - - [4096, 3398, 1, 1024] + - [7, 90.068] + - - [1024, 3154, 1, 4096] + - [40, 82.005] + - - [1024, 3978, 1, 33708] + - [14, 84.568] + - - [1024, 3348, 1, 4096] + - [23, 86.242] + - - [4096, 3304, 1, 1024] + - [7, 88.015] + - - [4096, 4030, 1, 1024] + - [7, 89.31] + - - [1024, 4026, 1, 1024] + - [35, 84.306] + - - [4096, 3471, 1, 1024] + - [7, 88.71] + - - [1024, 3259, 1, 4096] + - [40, 84.59] + - - [1024, 3308, 1, 4096] + - [40, 85.885] + - - [4096, 3391, 1, 1024] + - [7, 89.937] + - - [1024, 3312, 1, 4096] + - [40, 85.845] + - - [1024, 3502, 1, 4096] + - [22, 80.101] + - - [1024, 3968, 1, 33708] + - [7, 88.177] + - - [1024, 3424, 1, 4096] + - [23, 88.001] + - - [4096, 4032, 1, 1024] + - [7, 88.989] + - - [1024, 3900, 1, 1024] + - [36, 84.288] + - - [4096, 3442, 1, 1024] + - [7, 91.326] + - - [1024, 3366, 1, 4096] + - [23, 86.738] + - - [4096, 3999, 1, 1024] + - [7, 88.493] + - - [1024, 3477, 1, 4096] + - [22, 79.546] + - - [1024, 2505, 1, 4096] + - [22, 80.954] + - - [4096, 3515, 1, 1024] + - [7, 89.634] + - - [1024, 3564, 1, 4096] + - [22, 81.337] + - - [4096, 3057, 1, 1024] + - [7, 88.461] + - - [1024, 3339, 1, 4096] + - [23, 86.156] + - - [4096, 3262, 1, 1024] + - [7, 86.833] + - - [1024, 4030, 1, 4096] + - [22, 85.511] + - - [1024, 3265, 1, 4096] + - [28, 84.816] + - - [1024, 3459, 1, 4096] + - [22, 79.136] + - - [4096, 3462, 1, 1024] + - [7, 88.249] + - - [1024, 3513, 1, 4096] + - [22, 80.304] + - - [1024, 3397, 1, 4096] + - [23, 87.735] + - - [4096, 3572, 1, 1024] + - [7, 91.024] + - - [4096, 3389, 1, 1024] + - [7, 89.846] + - - [4096, 3438, 1, 1024] + - [7, 91.209] + - - [1024, 3640, 1, 33708] + - [14, 83.169] + - - [1024, 3995, 1, 33708] + - [14, 84.96] + - - [1024, 3165, 1, 4096] + - [40, 82.046] + - - [4096, 3543, 1, 1024] + - [7, 90.302] + - - [4096, 3352, 1, 1024] + - [7, 88.822] + - - [1024, 3359, 1, 4096] + - [23, 86.584] + - - [1024, 3470, 1, 4096] + - [6, 79.289] + - - [1024, 3392, 1, 4096] + - [23, 87.631] + - - [4096, 3137, 1, 1024] + - [7, 87.383] + - - [4096, 3506, 1, 1024] + - [7, 89.422] + - - [1024, 3095, 1, 4096] + - [22, 83.092] + - - [1024, 3859, 1, 4096] + - [23, 85.984] + - - [4096, 3369, 1, 1024] + - [7, 89.359] + - - [1024, 3435, 1, 4096] + - [23, 88.52] + - - [1024, 3354, 1, 4096] + - [23, 86.67] + - - [1024, 3055, 1, 4096] + - [22, 82.158] + - - [4096, 3523, 1, 1024] + - [7, 89.833] + - - [4096, 3380, 1, 1024] + - [7, 89.81] + - - [1024, 3233, 1, 4096] + - [40, 83.986] + - - [4096, 3221, 1, 1024] + - [7, 86.088] + - - [4096, 3270, 1, 1024] + - [7, 87.171] + - - [4096, 3593, 1, 1024] + - [7, 88.358] + - - [1024, 3358, 1, 4096] + - [23, 86.598] + - - [1024, 3540, 1, 4096] + - [22, 80.949] + - - [4096, 3502, 1, 1024] + - [7, 89.373] + - - [4096, 2505, 1, 1024] + - [7, 88.191] + - - [4096, 3397, 1, 1024] + - [7, 90.216] + - - [1024, 3300, 1, 4096] + - [40, 85.484] + - - [4096, 3095, 1, 1024] + - [7, 86.273] + - - [1024, 3182, 1, 4096] + - [40, 82.759] + - - [1024, 3299, 1, 4096] + - [13, 85.705] + - - [1024, 3276, 1, 4096] + - [13, 85.186] + - - [1024, 3360, 1, 4096] + - [23, 86.788] + - - [4096, 3360, 1, 1024] + - [7, 89.174] + - - [4096, 2918, 1, 1024] + - [7, 88.669] + - - [1024, 3939, 1, 33708] + - [7, 87.595] + - - [4096, 3314, 1, 1024] + - [7, 88.042] + - - [1024, 3319, 1, 4096] + - [40, 85.966] + - - [1024, 3942, 1, 1024] + - [7, 85.398] + - - [1024, 3465, 1, 4096] + - [22, 79.289] + - - [4096, 3546, 1, 1024] + - [7, 90.379] + - - [1024, 3403, 1, 4096] + - [23, 87.676] + - - [1024, 3948, 1, 1024] + - [23, 85.772] + - - [4096, 3441, 1, 1024] + - [7, 91.051] + - - [1024, 3139, 1, 4096] + - [40, 81.468] + - - [1024, 3563, 1, 4096] + - [22, 81.477] + - - [1024, 3508, 1, 4096] + - [22, 80.173] + - - [1024, 3975, 1, 33708] + - [14, 84.581] + - - [1024, 3446, 1, 4096] + - [7, 88.592] + - - [1024, 3529, 1, 4096] + - [22, 80.602] + - - [4096, 3461, 1, 1024] + - [7, 88.24] + - - [1024, 3574, 1, 4096] + - [6, 81.468] + - - [1024, 3101, 1, 4096] + - [6, 83.192] + - - [1024, 3927, 1, 1024] + - [23, 85.439] + - - [4096, 3224, 1, 1024] + - [7, 85.881] + - - [4096, 3437, 1, 1024] + - [7, 91.204] + - - [4096, 3900, 1, 1024] + - [7, 89.269] + - - [1024, 3495, 1, 4096] + - [6, 79.871] + - - [1024, 3977, 1, 33708] + - [14, 84.441] + - - [1024, 3328, 1, 4096] + - [13, 86.553] + - - [4096, 3168, 1, 1024] + - [7, 88.042] + - - [1024, 4026, 1, 33708] + - [14, 85.502] + - - [1024, 3292, 1, 4096] + - [40, 85.204] + - - [1024, 3294, 1, 4096] + - [13, 85.529] + - - [4096, 3335, 1, 1024] + - [7, 88.479] + - - [4096, 3400, 1, 1024] + - [7, 90.289] + - - [1024, 3287, 1, 4096] + - [40, 85.127] + - - [1024, 3910, 1, 4096] + - [23, 87.347] + - - [1024, 3780, 1, 1024] + - [23, 82.348] + - - [4096, 3098, 1, 1024] + - [7, 86.291] + - - [1024, 3584, 1, 33708] + - [29, 81.973] + - - [1024, 3371, 1, 4096] + - [7, 86.738] + - - [1024, 3546, 1, 4096] + - [22, 81.085] + - - [1024, 4012, 1, 1024] + - [22, 83.977] + - - [4096, 3505, 1, 1024] + - [7, 89.422] + - - [4096, 3554, 1, 1024] + - [7, 90.74] + - - [4096, 3063, 1, 1024] + - [7, 88.619] + - - [1024, 3900, 1, 33708] + - [1, 86.788] + - - [1024, 3345, 1, 4096] + - [23, 85.998] + - - [1024, 3357, 1, 4096] + - [23, 86.751] + - - [1024, 3282, 1, 4096] + - [28, 85.276] + - - [4096, 3484, 1, 1024] + - [7, 88.813] + - - [1024, 3557, 1, 4096] + - [6, 81.161] + - - [1024, 3476, 1, 4096] + - [22, 79.442] + - - [1024, 3751, 1, 1024] + - [23, 81.513] + - - [4096, 3379, 1, 1024] + - [7, 89.513] + - - [4096, 3428, 1, 1024] + - [7, 90.974] + - - [4096, 3126, 1, 1024] + - [7, 87.018] + - - [1024, 3325, 1, 4096] + - [40, 86.242] + - - [4096, 3501, 1, 1024] + - [7, 89.382] + - - [4096, 3358, 1, 1024] + - [7, 89.098] + - - [1024, 3441, 1, 4096] + - [7, 88.737] + - - [1024, 3552, 1, 4096] + - [22, 81.22] + - - [4096, 3232, 1, 1024] + - [7, 86.219] + - - [1024, 3412, 1, 4096] + - [7, 88.055] + - - [1024, 3372, 1, 4096] + - [23, 87.162] + - - [1024, 3585, 1, 4096] + - [22, 81.879] + - - [4096, 3143, 1, 1024] + - [7, 87.577] + - - [4096, 3464, 1, 1024] + - [7, 88.403] + - - [1024, 3145, 1, 4096] + - [28, 81.856] + - - [4096, 3375, 1, 1024] + - [7, 89.689] + - - [4096, 2917, 1, 1024] + - [7, 88.615] + - - [4096, 3978, 1, 1024] + - [7, 88.127] + - - [1024, 2765, 1, 4096] + - [23, 84.658] + - - [1024, 3452, 1, 4096] + - [23, 88.98] + - - [4096, 3584, 1, 1024] + - [7, 91.75] + - - [4096, 3545, 1, 1024] + - [7, 90.465] + - - [1024, 3352, 1, 4096] + - [23, 86.305] + - - [4096, 3292, 1, 1024] + - [7, 87.658] + - - [1024, 3525, 1, 4096] + - [22, 80.512] + - - [1024, 3266, 1, 4096] + - [13, 84.911] + - - [1024, 3382, 1, 4096] + - [23, 87.261] + - - [4096, 3492, 1, 1024] + - [7, 89.088] + - - [4096, 3419, 1, 1024] + - [7, 90.415] + - - [1024, 3796, 1, 33708] + - [1, 84.726] + - - [1024, 3293, 1, 4096] + - [40, 84.951] + - - [4096, 3796, 1, 1024] + - [7, 89.842] + - - [1024, 3487, 1, 4096] + - [22, 79.654] + - - [4096, 3166, 1, 1024] + - [7, 87.938] + - - [1024, 3409, 1, 4096] + - [23, 87.744] + - - [1024, 3520, 1, 4096] + - [35, 80.579] + - - [1024, 3573, 1, 4096] + - [22, 81.581] + - - [4096, 3366, 1, 1024] + - [7, 89.4] + - - [4096, 3720, 1, 1024] + - [7, 88.326] + - - [4096, 3207, 1, 1024] + - [7, 85.578] + - - [4096, 3272, 1, 1024] + - [7, 87.266] + - - [1024, 3390, 1, 4096] + - [23, 87.505] + - - [4096, 3183, 1, 1024] + - [7, 88.506] + - - [4096, 3536, 1, 1024] + - [7, 90.243] + - - [4096, 3563, 1, 1024] + - [7, 90.735] + - - [1024, 3482, 1, 4096] + - [6, 79.623] + - - [4096, 3447, 1, 1024] + - [7, 91.245] + - - [4096, 3955, 1, 1024] + - [7, 90.374] + - - [4096, 4005, 1, 1024] + - [7, 88.457] + - - [1024, 3493, 1, 4096] + - [6, 79.853] + - - [4096, 3410, 1, 1024] + - [7, 90.419] + - - [1024, 3422, 1, 4096] + - [7, 88.042] + - - [1024, 3350, 1, 4096] + - [7, 86.368] + - - [4096, 3300, 1, 1024] + - [7, 87.803] + - - [4096, 3910, 1, 1024] + - [7, 89.296] + - - [1024, 3489, 1, 4096] + - [22, 79.763] + - - [4096, 3483, 1, 1024] + - [7, 88.728] + - - [4096, 3532, 1, 1024] + - [7, 90.104] + - - [4096, 3230, 1, 1024] + - [7, 85.993] + - - [4096, 3427, 1, 1024] + - [7, 90.974] + - - [1024, 3377, 1, 4096] + - [23, 87.126] + - - [1024, 3488, 1, 4096] + - [22, 79.754] + - - [1024, 3616, 1, 4096] + - [22, 82.425] + - - [1024, 3426, 1, 4096] + - [23, 88.299] + - - [4096, 3357, 1, 1024] + - [7, 88.985] + - - [4096, 3406, 1, 1024] + - [7, 90.253] + - - [1024, 3046, 1, 4096] + - [6, 81.802] + - - [1024, 3272, 1, 4096] + - [40, 84.811] + - - [1024, 3256, 1, 4096] + - [28, 84.613] + - - [4096, 3247, 1, 1024] + - [7, 86.548] + - - [4096, 3088, 1, 1024] + - [7, 85.971] + - - [1024, 3531, 1, 4096] + - [35, 80.593] + - - [4096, 3511, 1, 1024] + - [7, 89.598] + - - [1024, 3720, 1, 33708] + - [1, 82.885] + - - [1024, 3267, 1, 4096] + - [40, 84.509] + - - [1024, 3270, 1, 4096] + - [13, 84.911] + - - [1024, 3461, 1, 4096] + - [22, 79.266] + - - [4096, 3474, 1, 1024] + - [7, 88.606] + - - [4096, 2984, 1, 1024] + - [7, 86.544] + - - [1024, 3399, 1, 4096] + - [23, 87.645] + - - [4096, 3574, 1, 1024] + - [7, 90.943] + - - [1024, 3876, 1, 1024] + - [7, 84.175] + - - [4096, 3337, 1, 1024] + - [7, 88.637] + - - [4096, 3450, 1, 1024] + - [7, 91.561] + - - [1024, 3720, 1, 1024] + - [23, 81.148] + - - [1024, 4059, 1, 1024] + - [22, 80.065] + - - [4096, 3291, 1, 1024] + - [7, 87.591] + - - [4096, 3995, 1, 1024] + - [7, 88.439] + - - [4096, 3491, 1, 1024] + - [7, 89.134] + - - [4096, 3348, 1, 1024] + - [7, 88.958] + - - [4096, 3925, 1, 1024] + - [7, 89.693] + - - [4096, 3894, 1, 1024] + - [7, 89.025] + - - [1024, 3456, 1, 4096] + - [23, 89.161] + - - [1024, 3394, 1, 4096] + - [23, 87.564] + - - [4096, 3165, 1, 1024] + - [7, 87.979] + - - [4096, 3470, 1, 1024] + - [7, 88.66] + - - [1024, 3014, 1, 4096] + - [6, 80.958] + - - [1024, 3375, 1, 4096] + - [23, 87.225] + - - [4096, 3859, 1, 1024] + - [7, 88.155] + - - [4096, 3365, 1, 1024] + - [7, 89.418] + - - [1024, 3162, 1, 4096] + - [28, 82.217] + - - [1024, 3840, 1, 33708] + - [1, 85.628] + - - [1024, 3437, 1, 4096] + - [7, 88.394] + - - [4096, 3319, 1, 1024] + - [7, 88.263] + - - [1024, 3320, 1, 4096] + - [40, 85.948] + - - [4096, 3328, 1, 1024] + - [7, 88.885] + - - [1024, 3235, 1, 4096] + - [40, 83.995] + - - [4096, 3282, 1, 1024] + - [7, 87.356] + - - [1024, 3367, 1, 4096] + - [23, 86.63] + - - [1024, 3542, 1, 4096] + - [22, 81.003] + - - [4096, 3145, 1, 1024] + - [7, 87.582] + - - [4096, 3514, 1, 1024] + - [7, 89.729] + - - [1024, 3432, 1, 4096] + - [23, 88.52] + - - [4096, 3409, 1, 1024] + - [7, 90.474] + - - [1024, 4012, 1, 33708] + - [14, 85.267] + - - [4096, 3876, 1, 1024] + - [7, 88.57] + - - [4096, 3299, 1, 1024] + - [7, 87.78] + - - [1024, 3168, 1, 4096] + - [40, 82.316] + - - [4096, 3681, 1, 1024] + - [7, 90.302] + - - [4096, 3531, 1, 1024] + - [7, 89.869] + - - [4096, 3388, 1, 1024] + - [7, 89.81] + - - [1024, 3720, 1, 4096] + - [7, 83.052] + - - [1024, 3332, 1, 4096] + - [23, 86.043] + - - [1024, 3273, 1, 4096] + - [13, 85.041] + - - [1024, 2935, 1, 4096] + - [6, 79.072] + - - [1024, 3467, 1, 4096] + - [6, 79.28] + - - [4096, 3542, 1, 1024] + - [7, 90.262] + - - [1024, 3130, 1, 4096] + - [22, 84.017] + - - [1024, 3405, 1, 4096] + - [23, 87.902] + - - [1024, 3960, 1, 1024] + - [7, 85.678] + - - [4096, 3405, 1, 1024] + - [7, 90.428] + - - [1024, 10080, 1, 1024] + - [7, 88.714] + - - [36548, 1216, 1, 1024] + - [36, 87.284] + - - [1024, 2592, 1, 1024] + - [36, 76.717] + - - [1024, 1568, 1, 1024] + - [24, 74.66] + - - [1024, 4445, 1, 1024] + - [23, 85.109] + - - [1024, 6272, 1, 1024] + - [23, 89.043] + - - [36548, 3584, 1, 1024] + - [36, 92.982] + - - [1024, 1827, 1, 1024] + - [35, 74.407] + - - [1024, 3220, 1, 1024] + - [23, 81.071] + - - [1024, 1856, 1, 1024] + - [22, 76.036] + - - [1024, 1760, 1, 1024] + - [35, 71.791] + - - [36548, 4235, 1, 1024] + - [36, 90.478] + - - [1024, 1984, 1, 1024] + - [22, 80.873] + - - [1024, 14720, 1, 1024] + - [23, 90.465] + - - [1024, 1152, 1, 1024] + - [24, 79.009] + - - [36548, 14976, 1, 1024] + - [36, 93.293] + - - [36548, 1152, 1, 1024] + - [4, 91.209] + - - [1024, 3392, 1, 1024] + - [7, 83.061] + - - [1024, 1408, 1, 1024] + - [22, 78.017] + - - [1024, 2080, 1, 1024] + - [9, 76.519] + - - [1024, 1824, 1, 1024] + - [22, 74.362] + - - [36548, 2432, 1, 1024] + - [36, 92.418] + - - [36548, 1827, 1, 1024] + - [36, 87.884] + - - [1024, 10176, 1, 1024] + - [23, 89.138] + - - [1024, 1952, 1, 1024] + - [35, 79.104] + - - [1024, 17024, 1, 1024] + - [7, 90.383] + - - [1024, 1472, 1, 1024] + - [24, 69.932] + - - [36548, 4459, 1, 1024] + - [36, 92.445] + - - [1024, 3712, 1, 1024] + - [35, 83.544] + - - [36548, 12928, 1, 1024] + - [36, 93.244] + - - [1024, 1632, 1, 1024] + - [24, 76.28] + - - [1024, 1696, 1, 1024] + - [22, 79.095] + - - [36548, 1764, 1, 1024] + - [36, 90.65] + - - [1024, 2944, 1, 1024] + - [35, 78.378] + - - [36548, 14080, 1, 1024] + - [36, 93.257] + - - [1024, 1280, 1, 1024] + - [6, 70.843] + - - [1024, 13440, 1, 1024] + - [23, 89.206] + - - [36548, 9120, 1, 1024] + - [36, 92.301] + - - [1024, 3008, 1, 1024] + - [6, 79.894] + - - [1024, 2560, 1, 1024] + - [35, 81.5] + - - [1024, 2208, 1, 1024] + - [7, 80.8] + - - [1024, 1920, 1, 1024] + - [22, 78.603] + - - [36548, 2496, 1, 1024] + - [36, 90.415] + - - [1024, 2016, 1, 1024] + - [23, 73.401] + - - [1024, 1184, 1, 1024] + - [6, 65.253] + - - [1024, 1664, 1, 1024] + - [8, 79.388] + - - [1024, 11424, 1, 1024] + - [23, 90.555] + - - [1024, 1216, 1, 1024] + - [6, 67.969] + - - [36548, 3185, 1, 1024] + - [36, 92.378] + - - [36548, 9216, 1, 1024] + - [36, 93.253] + - - [1024, 3200, 1, 1024] + - [7, 80.697] + - - [1024, 2656, 1, 1024] + - [36, 78.346] + - - [1024, 2368, 1, 1024] + - [22, 76.126] + - - [1024, 4459, 1, 1024] + - [7, 85.479] + - - [1024, 3808, 1, 1024] + - [23, 82.912] + - - [1024, 2336, 1, 1024] + - [35, 74.371] + - - [1024, 2304, 1, 1024] + - [28, 85.475] + - - [1024, 1560, 1, 1024] + - [24, 74.353] + - - [1024, 2496, 1, 1024] + - [35, 79.582] + - - [1024, 1504, 1, 1024] + - [24, 71.281] + - - [1024, 3232, 1, 1024] + - [23, 81.391] + - - [36548, 1015, 1, 1024] + - [36, 90.898] + - - [1024, 2000, 1, 1024] + - [23, 72.941] + - - [36548, 243, 1, 1024] + - [11, 83.814] + - - [1024, 13184, 1, 1024] + - [23, 91.11] + - - [1024, 2688, 1, 1024] + - [7, 80.354] + - - [36548, 950, 1, 1024] + - [29, 85.786] + - - [1024, 1764, 1, 1024] + - [35, 71.66] + - - [1024, 1376, 1, 1024] + - [6, 75.657] + - - [36548, 774, 1, 1024] + - [29, 80.602] + - - [1024, 4256, 1, 1024] + - [35, 83.097] + - - [36548, 3712, 1, 1024] + - [36, 92.833] + - - [1024, 3360, 1, 1024] + - [7, 82.294] + - - [1024, 2784, 1, 1024] + - [23, 82.339] + - - [1024, 4992, 1, 1024] + - [23, 86.102] + - - [36548, 1102, 1, 1024] + - [36, 87.694] + - - [1024, 1536, 1, 1024] + - [8, 73.103] + - - [1024, 2720, 1, 1024] + - [36, 80.304] + - - [1024, 2752, 1, 1024] + - [23, 81.617] + - - [1024, 2816, 1, 1024] + - [23, 83.882] + - - [1024, 2624, 1, 1024] + - [23, 78.066] + - - [1024, 2144, 1, 1024] + - [25, 78.554] + - - [36548, 1131, 1, 1024] + - [19, 89.738] + - - [1024, 3296, 1, 1024] + - [23, 81.017] + - - [36548, 4992, 1, 1024] + - [36, 93.099] + - - [1024, 1344, 1, 1024] + - [22, 73.866] + - - [36548, 2401, 1, 1024] + - [36, 91.164] + - - [1024, 15744, 1, 1024] + - [36, 89.716] + - - [1024, 15232, 1, 1024] + - [23, 90.406] + - - [1024, 1888, 1, 1024] + - [35, 76.654] + - - [1024, 1792, 1, 1024] + - [22, 73.694] + - - [36548, 1073, 1, 1024] + - [36, 85.637] + - - [36548, 15488, 1, 1024] + - [36, 93.235] + - - [1024, 2464, 1, 1024] + - [35, 78.351] + - - [1024, 2272, 1, 1024] + - [23, 82.781] + - - [1024, 2432, 1, 1024] + - [35, 77.71] + - - [1024, 3936, 1, 1024] + - [7, 85.587] + - - [36548, 13824, 1, 1024] + - [36, 93.298] + - - [1024, 2401, 1, 1024] + - [35, 76.388] + - - [1024, 2176, 1, 1024] + - [23, 80.615] + - - [1024, 2240, 1, 1024] + - [7, 81.793] + - - [1024, 1728, 1, 1024] + - [22, 81.166] + - - [1024, 2528, 1, 1024] + - [35, 80.408] + - - [1024, 2400, 1, 1024] + - [35, 76.677] + - - [1024, 1440, 1, 1024] + - [24, 68.912] + - - [1024, 2912, 1, 1024] + - [22, 77.299] + - - [1024, 2880, 1, 1024] + - [22, 83.467] + - - [1024, 4064, 1, 1024] + - [35, 80.142] + - - [1024, 4655, 1, 1024] + - [22, 80.891] + - - [36548, 6272, 1, 1024] + - [36, 93.167] + - - [768, 2048, 1, 3072] + - [276, 76.427] + - - [768, 4096, 1, 3072] + - [22, 82.483] + - - [6272, 256, 1, 528] + - [16, 74.646] + - - [3136, 2048, 1, 1024] + - [23, 82.411] + - - [50176, 128, 1, 256] + - [36, 86.174] + - - [12544, 1024, 1, 256] + - [30, 88.89] + - - [12544, 256, 1, 1024] + - [0, 83.638] + - - [3136, 512, 1, 1024] + - [8, 74.868] + - - [3136, 2048, 1, 512] + - [23, 81.446] + - - [289, 384, 32, 1024] + - [23, 65.948] + - - [4096, 512, 1, 4096] + - [21, 79.239] + - - [50176, 512, 1, 256] + - [1, 89.959] + - - [12544, 1024, 1, 512] + - [1, 90.212] + - - [12544, 256, 1, 512] + - [14, 82.718] + - - [784, 128, 32, 256] + - [22, 67.829] + - - [4096, 512, 1, 9216] + - [21, 79.889] + - - [3136, 512, 1, 2048] + - [16, 75.788] + - - [1225, 192, 32, 384] + - [29, 80.751] + - - [8192, 320, 1, 1280] + - [14, 82.267] + - - [8192, 320, 1, 2048] + - [0, 82.456] + - - [8192, 384, 1, 1280] + - [0, 82.1] + - - [8192, 384, 1, 2048] + - [0, 82.357] + - - [8192, 448, 1, 2048] + - [14, 81.816] + - - [8192, 448, 1, 1280] + - [14, 81.549] + - - [256, 6400, 1, 4096] + - [23, 77.8] + - - [512, 3433, 1, 2048] + - [37, 82.668] + - - [512, 3439, 1, 2048] + - [24, 82.849] + - - [512, 3461, 1, 2048] + - [35, 71.894] + - - [512, 3479, 1, 2048] + - [22, 72.255] + - - [512, 3494, 1, 2048] + - [6, 72.458] + - - [512, 3520, 1, 2048] + - [6, 73.212] + - - [512, 3530, 1, 2048] + - [22, 73.266] + - - [512, 3541, 1, 2048] + - [35, 73.347] + - - [512, 3564, 1, 2048] + - [22, 73.83] + - - [512, 3776, 1, 2048] + - [35, 78.26] + - - [512, 3859, 1, 512] + - [6, 76.095] + - - [512, 3925, 1, 2048] + - [35, 80.958] + - - [512, 3944, 1, 2048] + - [6, 81.428] + - - [512, 3955, 1, 2048] + - [22, 81.549] + - - [512, 3969, 1, 2048] + - [13, 75.571] + - - [512, 3976, 1, 2048] + - [13, 75.72] + - - [2048, 1232, 1, 512] + - [29, 77.322] + - - [2048, 3165, 1, 512] + - [29, 81.892] + - - [512, 2387, 1, 512] + - [35, 64.31] + - - [512, 2418, 1, 512] + - [35, 64.73] + - - [512, 2418, 1, 2048] + - [35, 68.222] + - - [512, 2496, 1, 512] + - [35, 68.398] + - - [512, 2496, 1, 2048] + - [6, 70.658] + - - [512, 2790, 1, 2048] + - [6, 78.287] + - - [512, 2864, 1, 2048] + - [8, 69.729] + - - [512, 3092, 1, 2048] + - [8, 75.084] + - - [512, 3113, 1, 2048] + - [37, 75.607] + - - [512, 3137, 1, 2048] + - [37, 75.995] + - - [512, 3165, 1, 2048] + - [24, 76.668] + - - [512, 3166, 1, 2048] + - [24, 76.857] + - - [512, 3194, 1, 2048] + - [8, 77.344] + - - [512, 3219, 1, 2048] + - [8, 78.08] + - - [512, 3222, 1, 2048] + - [24, 77.994] + - - [512, 3234, 1, 2048] + - [36, 78.229] + - - [512, 3237, 1, 2048] + - [8, 78.459] + - - [512, 3242, 1, 2048] + - [37, 78.558] + - - [512, 3246, 1, 2048] + - [24, 78.581] + - - [512, 3249, 1, 2048] + - [24, 78.712] + - - [512, 3251, 1, 2048] + - [24, 78.761] + - - [512, 3257, 1, 2048] + - [23, 78.784] + - - [512, 3262, 1, 2048] + - [8, 79.009] + - - [512, 3268, 1, 2048] + - [8, 79.122] + - - [512, 3282, 1, 2048] + - [24, 79.429] + - - [512, 3286, 1, 2048] + - [8, 79.479] + - - [512, 3287, 1, 2048] + - [8, 79.519] + - - [512, 3293, 1, 2048] + - [8, 79.645] + - - [512, 3297, 1, 2048] + - [24, 79.668] + - - [512, 3307, 1, 2048] + - [37, 79.943] + - - [512, 3314, 1, 2048] + - [37, 79.984] + - - [512, 3315, 1, 2048] + - [8, 80.128] + - - [512, 3319, 1, 2048] + - [8, 79.966] + - - [512, 3322, 1, 2048] + - [37, 80.11] + - - [512, 3323, 1, 2048] + - [36, 80.083] + - - [512, 3324, 1, 2048] + - [23, 80.2] + - - [512, 3325, 1, 2048] + - [24, 80.304] + - - [512, 3327, 1, 2048] + - [8, 80.255] + - - [512, 3329, 1, 2048] + - [37, 80.543] + - - [512, 3332, 1, 2048] + - [24, 80.597] + - - [512, 3336, 1, 2048] + - [8, 80.579] + - - [512, 3339, 1, 2048] + - [37, 80.719] + - - [512, 3342, 1, 2048] + - [8, 80.846] + - - [512, 3344, 1, 2048] + - [8, 80.814] + - - [512, 3358, 1, 2048] + - [8, 81.076] + - - [512, 3360, 1, 2048] + - [37, 81.202] + - - [512, 3364, 1, 2048] + - [8, 81.197] + - - [512, 3365, 1, 2048] + - [37, 81.306] + - - [512, 3369, 1, 2048] + - [24, 81.351] + - - [512, 3370, 1, 2048] + - [8, 81.428] + - - [512, 3371, 1, 2048] + - [37, 81.36] + - - [512, 3374, 1, 2048] + - [8, 81.563] + - - [512, 3376, 1, 2048] + - [37, 81.554] + - - [512, 3377, 1, 2048] + - [8, 81.545] + - - [512, 3378, 1, 2048] + - [37, 81.608] + - - [512, 3381, 1, 2048] + - [37, 81.685] + - - [512, 3382, 1, 2048] + - [8, 81.834] + - - [512, 3383, 1, 2048] + - [24, 81.757] + - - [512, 3384, 1, 2048] + - [37, 81.689] + - - [512, 3385, 1, 2048] + - [8, 81.712] + - - [512, 3386, 1, 2048] + - [8, 81.789] + - - [512, 3388, 1, 2048] + - [24, 81.852] + - - [512, 3390, 1, 2048] + - [8, 81.793] + - - [512, 3391, 1, 2048] + - [8, 81.716] + - - [512, 3396, 1, 2048] + - [37, 82.145] + - - [512, 3399, 1, 2048] + - [24, 82.032] + - - [512, 3402, 1, 2048] + - [24, 82.249] + - - [512, 3410, 1, 2048] + - [24, 82.181] + - - [512, 3412, 1, 2048] + - [37, 82.163] + - - [512, 3414, 1, 2048] + - [37, 82.389] + - - [512, 3415, 1, 2048] + - [37, 82.51] + - - [512, 3418, 1, 2048] + - [24, 82.384] + - - [512, 3420, 1, 2048] + - [24, 82.506] + - - [512, 3422, 1, 2048] + - [24, 82.614] + - - [512, 3425, 1, 2048] + - [24, 82.601] + - - [512, 3426, 1, 2048] + - [37, 82.619] + - - [512, 3427, 1, 2048] + - [37, 82.592] + - - [512, 3428, 1, 2048] + - [8, 82.546] + - - [512, 3430, 1, 2048] + - [8, 82.506] + - - [512, 3431, 1, 2048] + - [24, 82.808] + - - [512, 3432, 1, 2048] + - [8, 82.957] + - - [512, 3438, 1, 2048] + - [24, 82.88] + - - [512, 3440, 1, 2048] + - [24, 82.804] + - - [512, 3443, 1, 2048] + - [37, 83.138] + - - [512, 3445, 1, 2048] + - [24, 82.934] + - - [512, 3447, 1, 2048] + - [8, 82.795] + - - [512, 3448, 1, 2048] + - [37, 82.695] + - - [512, 3450, 1, 2048] + - [24, 82.984] + - - [512, 3451, 1, 2048] + - [8, 83.119] + - - [512, 3452, 1, 2048] + - [8, 82.713] + - - [512, 3453, 1, 2048] + - [37, 83.304] + - - [512, 3455, 1, 2048] + - [24, 83.192] + - - [512, 3456, 1, 2048] + - [37, 83.309] + - - [512, 3457, 1, 2048] + - [22, 71.894] + - - [512, 3458, 1, 2048] + - [22, 71.908] + - - [512, 3459, 1, 2048] + - [35, 71.854] + - - [512, 3460, 1, 2048] + - [35, 71.93] + - - [512, 3462, 1, 2048] + - [6, 71.921] + - - [512, 3466, 1, 2048] + - [22, 72.21] + - - [512, 3467, 1, 2048] + - [6, 72.097] + - - [512, 3468, 1, 2048] + - [35, 72.219] + - - [512, 3470, 1, 2048] + - [35, 72.021] + - - [512, 3471, 1, 2048] + - [22, 72.729] + - - [512, 3472, 1, 2048] + - [22, 72.174] + - - [512, 3475, 1, 2048] + - [6, 72.097] + - - [512, 3476, 1, 2048] + - [22, 72.246] + - - [512, 3477, 1, 2048] + - [6, 72.67] + - - [512, 3478, 1, 2048] + - [35, 72.156] + - - [512, 3480, 1, 2048] + - [22, 72.327] + - - [512, 3481, 1, 2048] + - [35, 72.291] + - - [512, 3483, 1, 2048] + - [35, 72.364] + - - [512, 3484, 1, 2048] + - [6, 72.341] + - - [512, 3487, 1, 2048] + - [22, 72.454] + - - [512, 3489, 1, 2048] + - [22, 72.413] + - - [512, 3490, 1, 2048] + - [6, 72.503] + - - [512, 3491, 1, 2048] + - [22, 72.454] + - - [512, 3493, 1, 2048] + - [22, 72.345] + - - [512, 3495, 1, 2048] + - [22, 72.391] + - - [512, 3497, 1, 2048] + - [22, 72.585] + - - [512, 3498, 1, 2048] + - [6, 72.607] + - - [512, 3499, 1, 2048] + - [22, 72.679] + - - [512, 3501, 1, 2048] + - [35, 72.86] + - - [512, 3503, 1, 2048] + - [35, 72.86] + - - [512, 3505, 1, 2048] + - [6, 72.652] + - - [512, 3507, 1, 2048] + - [35, 72.864] + - - [512, 3508, 1, 2048] + - [35, 72.733] + - - [512, 3509, 1, 2048] + - [22, 73.009] + - - [512, 3510, 1, 2048] + - [35, 72.95] + - - [512, 3511, 1, 2048] + - [6, 72.864] + - - [512, 3513, 1, 2048] + - [6, 72.797] + - - [512, 3514, 1, 2048] + - [35, 72.625] + - - [512, 3515, 1, 2048] + - [6, 72.946] + - - [512, 3517, 1, 2048] + - [6, 72.918] + - - [512, 3518, 1, 2048] + - [6, 72.927] + - - [512, 3519, 1, 2048] + - [6, 72.801] + - - [512, 3523, 1, 2048] + - [6, 73.216] + - - [512, 3528, 1, 2048] + - [6, 73.189] + - - [512, 3529, 1, 2048] + - [22, 73.117] + - - [512, 3531, 1, 2048] + - [22, 73.112] + - - [512, 3532, 1, 2048] + - [22, 73.221] + - - [512, 3533, 1, 2048] + - [22, 73.334] + - - [512, 3534, 1, 2048] + - [22, 73.194] + - - [512, 3538, 1, 2048] + - [22, 73.361] + - - [512, 3539, 1, 2048] + - [6, 73.338] + - - [512, 3540, 1, 2048] + - [22, 73.325] + - - [512, 3547, 1, 2048] + - [22, 73.546] + - - [512, 3548, 1, 2048] + - [22, 73.541] + - - [512, 3552, 1, 2048] + - [22, 73.622] + - - [512, 3575, 1, 2048] + - [6, 74.01] + - - [512, 3598, 1, 2048] + - [22, 74.597] + - - [512, 3599, 1, 2048] + - [22, 74.52] + - - [512, 3608, 1, 2048] + - [22, 74.791] + - - [512, 3776, 1, 512] + - [35, 76.081] + - - [512, 3780, 1, 512] + - [6, 74.836] + - - [512, 3780, 1, 2048] + - [6, 78.202] + - - [512, 3780, 1, 33708] + - [0, 78.865] + - - [512, 3796, 1, 512] + - [22, 72.133] + - - [512, 3796, 1, 2048] + - [22, 78.581] + - - [512, 3796, 1, 33708] + - [0, 79.095] + - - [512, 3822, 1, 512] + - [35, 73.027] + - - [512, 3822, 1, 2048] + - [6, 78.725] + - - [512, 3822, 1, 33708] + - [14, 79.614] + - - [512, 3835, 1, 512] + - [35, 73.464] + - - [512, 3835, 1, 2048] + - [22, 79.054] + - - [512, 3840, 1, 512] + - [22, 77.29] + - - [512, 3840, 1, 2048] + - [22, 79.393] + - - [512, 3840, 1, 33708] + - [29, 79.961] + - - [512, 3859, 1, 2048] + - [22, 79.451] + - - [512, 3859, 1, 33708] + - [22, 80.273] + - - [512, 3864, 1, 512] + - [35, 73.852] + - - [512, 3864, 1, 2048] + - [35, 79.826] + - - [512, 3870, 1, 512] + - [35, 76.487] + - - [512, 3870, 1, 2048] + - [35, 79.871] + - - [512, 3870, 1, 33708] + - [14, 80.372] + - - [512, 3876, 1, 512] + - [6, 73.866] + - - [512, 3876, 1, 2048] + - [6, 79.961] + - - [512, 3876, 1, 33708] + - [29, 80.471] + - - [512, 3906, 1, 512] + - [35, 74.728] + - - [512, 3906, 1, 2048] + - [22, 80.701] + - - [512, 3906, 1, 33708] + - [0, 81.315] + - - [512, 3910, 1, 512] + - [35, 75.12] + - - [512, 3910, 1, 2048] + - [35, 80.755] + - - [512, 3910, 1, 33708] + - [0, 81.216] + - - [512, 3925, 1, 512] + - [35, 74.872] + - - [512, 3925, 1, 33708] + - [29, 81.41] + - - [512, 3927, 1, 512] + - [22, 75.107] + - - [512, 3942, 1, 512] + - [35, 78.071] + - - [512, 3942, 1, 2048] + - [6, 81.382] + - - [512, 3942, 1, 33708] + - [29, 81.834] + - - [512, 3944, 1, 512] + - [35, 75.382] + - - [512, 3944, 1, 33708] + - [14, 81.825] + - - [512, 3955, 1, 512] + - [22, 75.594] + - - [512, 3955, 1, 33708] + - [14, 82.118] + - - [512, 3968, 1, 512] + - [35, 79.167] + - - [512, 3968, 1, 2048] + - [35, 82.041] + - - [512, 3968, 1, 33708] + - [0, 82.393] + - - [512, 3969, 1, 512] + - [22, 70.293] + - - [512, 3969, 1, 33708] + - [34, 77.809] + - - [512, 3976, 1, 512] + - [22, 70.017] + - - [512, 3976, 1, 33708] + - [5, 77.936] + - - [512, 3977, 1, 512] + - [6, 69.666] + - - [512, 3977, 1, 2048] + - [40, 75.639] + - - [512, 3977, 1, 33708] + - [21, 78.021] + - - [512, 3978, 1, 512] + - [22, 69.869] + - - [512, 3978, 1, 2048] + - [13, 75.716] + - - [512, 3978, 1, 33708] + - [21, 78.026] + - - [512, 3990, 1, 512] + - [22, 70.478] + - - [512, 3990, 1, 2048] + - [40, 75.666] + - - [512, 3990, 1, 33708] + - [34, 78.278] + - - [512, 3995, 1, 512] + - [35, 70.924] + - - [512, 3995, 1, 2048] + - [40, 75.829] + - - [512, 3995, 1, 33708] + - [21, 78.382] + - - [512, 3996, 1, 512] + - [37, 70.518] + - - [512, 3996, 1, 2048] + - [40, 75.892] + - - [512, 3996, 1, 33708] + - [21, 78.427] + - - [512, 3999, 1, 512] + - [22, 70.802] + - - [512, 3999, 1, 2048] + - [40, 76.004] + - - [512, 3999, 1, 33708] + - [21, 78.441] + - - [512, 4005, 1, 512] + - [22, 71.015] + - - [512, 4005, 1, 2048] + - [40, 75.914] + - - [512, 4005, 1, 33708] + - [21, 78.581] + - - [512, 4012, 1, 512] + - [35, 70.992] + - - [512, 4012, 1, 2048] + - [13, 76.23] + - - [512, 4012, 1, 33708] + - [21, 78.675] + - - [512, 4020, 1, 512] + - [37, 71.028] + - - [512, 4020, 1, 2048] + - [13, 76.447] + - - [512, 4020, 1, 33708] + - [21, 78.878] + - - [512, 4026, 1, 512] + - [35, 71.114] + - - [512, 4026, 1, 2048] + - [28, 76.392] + - - [512, 4026, 1, 33708] + - [34, 78.964] + - - [512, 4030, 1, 512] + - [37, 71.267] + - - [512, 4030, 1, 2048] + - [13, 76.55] + - - [512, 4030, 1, 33708] + - [21, 79.036] + - - [512, 4032, 1, 512] + - [35, 71.971] + - - [512, 4032, 1, 2048] + - [13, 76.654] + - - [512, 4032, 1, 33708] + - [21, 79.104] + - - [512, 4050, 1, 512] + - [6, 71.384] + - - [512, 4059, 1, 512] + - [36, 72.039] + - - [2048, 644, 1, 512] + - [29, 69.273] + - - [2048, 668, 1, 512] + - [29, 71.791] + - - [2048, 714, 1, 512] + - [29, 65.871] + - - [2048, 720, 1, 512] + - [29, 66.476] + - - [2048, 722, 1, 512] + - [29, 66.584] + - - [2048, 781, 1, 512] + - [29, 71.52] + - - [2048, 848, 1, 512] + - [35, 67.721] + - - [2048, 872, 1, 512] + - [35, 69.499] + - - [2048, 936, 1, 512] + - [6, 74.299] + - - [2048, 980, 1, 512] + - [29, 70.162] + - - [2048, 1139, 1, 512] + - [29, 79.244] + - - [2048, 1184, 1, 512] + - [29, 74.525] + - - [2048, 1186, 1, 512] + - [29, 74.434] + - - [2048, 1279, 1, 512] + - [22, 78.991] + - - [2048, 1290, 1, 512] + - [23, 74.191] + - - [2048, 1327, 1, 512] + - [35, 75.874] + - - [2048, 1331, 1, 512] + - [23, 76.51] + - - [2048, 1341, 1, 512] + - [23, 77.178] + - - [2048, 1350, 1, 512] + - [23, 77.489] + - - [2048, 1359, 1, 512] + - [29, 77.742] + - - [2048, 1391, 1, 512] + - [23, 79.497] + - - [2048, 1424, 1, 512] + - [29, 74.655] + - - [2048, 1458, 1, 512] + - [29, 76.42] + - - [2048, 1462, 1, 512] + - [29, 76.713] + - - [2048, 1467, 1, 512] + - [29, 76.993] + - - [2048, 1472, 1, 512] + - [29, 77.895] + - - [2048, 1520, 1, 512] + - [29, 79.704] + - - [2048, 1596, 1, 512] + - [7, 77.552] + - - [2048, 1599, 1, 512] + - [7, 77.94] + - - [2048, 1615, 1, 512] + - [7, 78.603] + - - [2048, 1680, 1, 512] + - [29, 80.706] + - - [2048, 1709, 1, 512] + - [29, 81.847] + - - [2048, 1902, 1, 512] + - [23, 80.471] + - - [2048, 1917, 1, 512] + - [23, 80.846] + - - [2048, 2076, 1, 512] + - [29, 80.408] + - - [2048, 2195, 1, 512] + - [23, 81.883] + - - [2048, 2205, 1, 512] + - [7, 82.334] + - - [2048, 2418, 1, 512] + - [29, 83.255] + - - [2048, 2496, 1, 512] + - [23, 83.485] + - - [2048, 2790, 1, 512] + - [7, 84.414] + - - [2048, 2864, 1, 512] + - [29, 84.302] + - - [2048, 3092, 1, 512] + - [22, 82.655] + - - [2048, 3113, 1, 512] + - [29, 83.562] + - - [2048, 3137, 1, 512] + - [29, 81.053] + - - [2048, 3166, 1, 512] + - [29, 81.563] + - - [2048, 3194, 1, 512] + - [7, 82.285] + - - [2048, 3219, 1, 512] + - [29, 82.944] + - - [2048, 3222, 1, 512] + - [22, 82.804] + - - [2048, 3234, 1, 512] + - [29, 83.422] + - - [2048, 3237, 1, 512] + - [22, 83.038] + - - [2048, 3242, 1, 512] + - [29, 83.584] + - - [2048, 3246, 1, 512] + - [29, 83.575] + - - [2048, 3249, 1, 512] + - [29, 83.598] + - - [2048, 3251, 1, 512] + - [29, 83.742] + - - [2048, 3257, 1, 512] + - [29, 83.823] + - - [2048, 3262, 1, 512] + - [29, 83.895] + - - [2048, 3268, 1, 512] + - [23, 83.517] + - - [2048, 3282, 1, 512] + - [23, 83.756] + - - [2048, 3286, 1, 512] + - [7, 83.968] + - - [2048, 3287, 1, 512] + - [23, 84.067] + - - [2048, 3293, 1, 512] + - [7, 84.166] + - - [2048, 3297, 1, 512] + - [7, 84.532] + - - [2048, 3307, 1, 512] + - [7, 84.577] + - - [2048, 3314, 1, 512] + - [7, 84.365] + - - [2048, 3315, 1, 512] + - [7, 84.708] + - - [2048, 3319, 1, 512] + - [7, 84.509] + - - [2048, 3322, 1, 512] + - [23, 84.577] + - - [2048, 3323, 1, 512] + - [23, 84.893] + - - [2048, 3324, 1, 512] + - [7, 84.875] + - - [2048, 3325, 1, 512] + - [7, 84.753] + - - [2048, 3327, 1, 512] + - [23, 84.811] + - - [2048, 3329, 1, 512] + - [7, 84.681] + - - [2048, 3332, 1, 512] + - [7, 84.829] + - - [2048, 3336, 1, 512] + - [23, 84.897] + - - [2048, 3339, 1, 512] + - [23, 85.402] + - - [2048, 3342, 1, 512] + - [23, 85.159] + - - [2048, 3344, 1, 512] + - [7, 85.199] + - - [2048, 3358, 1, 512] + - [7, 85.511] + - - [2048, 3360, 1, 512] + - [23, 85.849] + - - [2048, 3364, 1, 512] + - [23, 85.831] + - - [2048, 3365, 1, 512] + - [23, 85.885] + - - [2048, 3369, 1, 512] + - [7, 85.524] + - - [2048, 3370, 1, 512] + - [7, 85.714] + - - [2048, 3371, 1, 512] + - [7, 85.804] + - - [2048, 3374, 1, 512] + - [23, 86.097] + - - [2048, 3376, 1, 512] + - [23, 85.768] + - - [2048, 3377, 1, 512] + - [7, 85.772] + - - [2048, 3378, 1, 512] + - [7, 85.984] + - - [2048, 3381, 1, 512] + - [7, 86.133] + - - [2048, 3382, 1, 512] + - [23, 86.242] + - - [2048, 3383, 1, 512] + - [7, 86.205] + - - [2048, 3384, 1, 512] + - [7, 86.183] + - - [2048, 3385, 1, 512] + - [23, 86.187] + - - [2048, 3386, 1, 512] + - [23, 86.336] + - - [2048, 3388, 1, 512] + - [7, 86.463] + - - [2048, 3390, 1, 512] + - [7, 86.057] + - - [2048, 3391, 1, 512] + - [23, 86.499] + - - [2048, 3396, 1, 512] + - [23, 86.472] + - - [2048, 3399, 1, 512] + - [23, 86.566] + - - [2048, 3402, 1, 512] + - [23, 86.873] + - - [2048, 3410, 1, 512] + - [23, 86.797] + - - [2048, 3412, 1, 512] + - [23, 86.86] + - - [2048, 3414, 1, 512] + - [23, 86.715] + - - [2048, 3415, 1, 512] + - [23, 87.076] + - - [2048, 3418, 1, 512] + - [23, 86.851] + - - [2048, 3420, 1, 512] + - [7, 87.036] + - - [2048, 3422, 1, 512] + - [7, 86.936] + - - [2048, 3425, 1, 512] + - [7, 86.788] + - - [2048, 3426, 1, 512] + - [23, 87.478] + - - [2048, 3427, 1, 512] + - [23, 87.166] + - - [2048, 3428, 1, 512] + - [7, 87.013] + - - [2048, 3430, 1, 512] + - [23, 87.27] + - - [2048, 3431, 1, 512] + - [23, 87.428] + - - [2048, 3432, 1, 512] + - [7, 87.234] + - - [2048, 3433, 1, 512] + - [23, 87.685] + - - [2048, 3438, 1, 512] + - [7, 87.473] + - - [2048, 3439, 1, 512] + - [7, 87.379] + - - [2048, 3440, 1, 512] + - [23, 87.338] + - - [2048, 3443, 1, 512] + - [23, 87.721] + - - [2048, 3445, 1, 512] + - [23, 87.586] + - - [2048, 3447, 1, 512] + - [23, 87.609] + - - [2048, 3448, 1, 512] + - [23, 87.749] + - - [2048, 3450, 1, 512] + - [7, 87.636] + - - [2048, 3451, 1, 512] + - [23, 87.87] + - - [2048, 3452, 1, 512] + - [7, 87.586] + - - [2048, 3453, 1, 512] + - [23, 87.609] + - - [2048, 3455, 1, 512] + - [23, 87.915] + - - [2048, 3456, 1, 512] + - [7, 88.741] + - - [2048, 3457, 1, 512] + - [29, 82.357] + - - [2048, 3458, 1, 512] + - [23, 82.664] + - - [2048, 3459, 1, 512] + - [29, 82.168] + - - [2048, 3460, 1, 512] + - [23, 82.596] + - - [2048, 3461, 1, 512] + - [29, 82.362] + - - [2048, 3462, 1, 512] + - [23, 82.587] + - - [2048, 3466, 1, 512] + - [23, 82.673] + - - [2048, 3467, 1, 512] + - [29, 82.519] + - - [2048, 3468, 1, 512] + - [23, 82.578] + - - [2048, 3470, 1, 512] + - [23, 82.628] + - - [2048, 3471, 1, 512] + - [23, 82.7] + - - [2048, 3472, 1, 512] + - [29, 82.655] + - - [2048, 3475, 1, 512] + - [23, 82.885] + - - [2048, 3476, 1, 512] + - [29, 82.709] + - - [2048, 3477, 1, 512] + - [23, 82.858] + - - [2048, 3478, 1, 512] + - [23, 82.993] + - - [2048, 3479, 1, 512] + - [23, 82.998] + - - [2048, 3480, 1, 512] + - [23, 83.115] + - - [2048, 3481, 1, 512] + - [23, 83.07] + - - [2048, 3483, 1, 512] + - [29, 83.002] + - - [2048, 3484, 1, 512] + - [23, 82.993] + - - [2048, 3487, 1, 512] + - [29, 82.88] + - - [2048, 3489, 1, 512] + - [29, 82.993] + - - [2048, 3490, 1, 512] + - [23, 83.187] + - - [2048, 3491, 1, 512] + - [23, 82.984] + - - [2048, 3493, 1, 512] + - [23, 83.291] + - - [2048, 3494, 1, 512] + - [29, 83.214] + - - [2048, 3495, 1, 512] + - [23, 83.273] + - - [2048, 3497, 1, 512] + - [23, 83.295] + - - [2048, 3498, 1, 512] + - [29, 83.291] + - - [2048, 3501, 1, 512] + - [29, 83.354] + - - [2048, 3503, 1, 512] + - [23, 83.494] + - - [2048, 3505, 1, 512] + - [23, 83.232] + - - [2048, 3507, 1, 512] + - [23, 83.539] + - - [2048, 3508, 1, 512] + - [23, 83.489] + - - [2048, 3509, 1, 512] + - [23, 83.512] + - - [2048, 3510, 1, 512] + - [23, 83.602] + - - [2048, 3511, 1, 512] + - [23, 83.593] + - - [2048, 3513, 1, 512] + - [23, 83.489] + - - [2048, 3514, 1, 512] + - [29, 83.53] + - - [2048, 3515, 1, 512] + - [23, 83.832] + - - [2048, 3517, 1, 512] + - [23, 83.873] + - - [2048, 3518, 1, 512] + - [23, 83.652] + - - [2048, 3519, 1, 512] + - [23, 83.945] + - - [2048, 3520, 1, 512] + - [29, 83.823] + - - [2048, 3523, 1, 512] + - [23, 83.972] + - - [2048, 3528, 1, 512] + - [23, 84.049] + - - [2048, 3529, 1, 512] + - [23, 84.013] + - - [2048, 3530, 1, 512] + - [23, 84.189] + - - [2048, 3531, 1, 512] + - [29, 83.891] + - - [2048, 3532, 1, 512] + - [29, 84.121] + - - [2048, 3533, 1, 512] + - [23, 84.311] + - - [2048, 3534, 1, 512] + - [23, 84.117] + - - [2048, 3538, 1, 512] + - [23, 84.356] + - - [2048, 3539, 1, 512] + - [23, 84.148] + - - [2048, 3540, 1, 512] + - [7, 84.117] + - - [2048, 3541, 1, 512] + - [29, 84.306] + - - [2048, 3547, 1, 512] + - [29, 84.545] + - - [2048, 3548, 1, 512] + - [23, 84.509] + - - [2048, 3552, 1, 512] + - [29, 84.496] + - - [2048, 3564, 1, 512] + - [29, 84.653] + - - [2048, 3575, 1, 512] + - [29, 84.843] + - - [2048, 3598, 1, 512] + - [23, 84.983] + - - [2048, 3599, 1, 512] + - [23, 85.082] + - - [2048, 3608, 1, 512] + - [7, 85.217] + - - [2048, 3776, 1, 512] + - [29, 83.72] + - - [2048, 3780, 1, 512] + - [7, 83.76] + - - [2048, 3796, 1, 512] + - [23, 83.846] + - - [2048, 3822, 1, 512] + - [23, 84.658] + - - [2048, 3835, 1, 512] + - [7, 84.599] + - - [2048, 3840, 1, 512] + - [7, 85.66] + - - [2048, 3859, 1, 512] + - [23, 84.92] + - - [2048, 3864, 1, 512] + - [23, 85.357] + - - [2048, 3870, 1, 512] + - [7, 85.204] + - - [2048, 3876, 1, 512] + - [23, 85.479] + - - [2048, 3906, 1, 512] + - [7, 86.052] + - - [2048, 3910, 1, 512] + - [23, 86.269] + - - [2048, 3925, 1, 512] + - [7, 86.228] + - - [2048, 3942, 1, 512] + - [23, 86.661] + - - [2048, 3944, 1, 512] + - [23, 86.914] + - - [2048, 3955, 1, 512] + - [23, 87.108] + - - [2048, 3968, 1, 512] + - [23, 87.843] + - - [2048, 3969, 1, 512] + - [29, 84.229] + - - [2048, 3976, 1, 512] + - [29, 84.473] + - - [2048, 3977, 1, 512] + - [29, 84.523] + - - [2048, 3978, 1, 512] + - [29, 84.468] + - - [2048, 3990, 1, 512] + - [22, 84.45] + - - [2048, 3995, 1, 512] + - [29, 84.897] + - - [2048, 3996, 1, 512] + - [29, 84.933] + - - [2048, 3999, 1, 512] + - [29, 84.87] + - - [2048, 4005, 1, 512] + - [29, 85.041] + - - [2048, 4012, 1, 512] + - [29, 85.235] + - - [2048, 4020, 1, 512] + - [29, 85.299] + - - [2048, 4026, 1, 512] + - [29, 85.416] + - - [2048, 4030, 1, 512] + - [29, 85.344] + - - [2048, 4032, 1, 512] + - [29, 85.876] + - - [1024, 4096, 1, 3072] + - [22, 81.617] + - - [1024, 3840, 1, 1024] + - [23, 83.918] + - - [1024, 3840, 1, 4096] + - [7, 85.808] + - - [1024, 3968, 1, 1024] + - [23, 86.436] + - - [1024, 3968, 1, 4096] + - [7, 88.696] + - - [1024, 3968, 1, 42720] + - [30, 87.947] + - - [1024, 7200, 1, 1024] + - [23, 86.16] + - - [1024, 7200, 1, 4096] + - [23, 87.442] + - - [1024, 7200, 1, 42720] + - [15, 88.258] + - - [1024, 8160, 1, 1024] + - [36, 85.515] + - - [1024, 8160, 1, 4096] + - [23, 86.982] + - - [1024, 9520, 1, 1024] + - [23, 88.177] + - - [1024, 9520, 1, 4096] + - [11, 88.606] + - - [1024, 9520, 1, 42720] + - [30, 89.923] + - - [1024, 10200, 1, 1024] + - [23, 89.251] + - - [1024, 10200, 1, 4096] + - [13, 89.874] + - - [4096, 3840, 1, 1024] + - [7, 90.961] + - - [4096, 3968, 1, 1024] + - [7, 90.965] + - - [4096, 7200, 1, 1024] + - [1, 90.28] + - - [4096, 8160, 1, 1024] + - [4, 91.168] + - - [4096, 9520, 1, 1024] + - [4, 90.889] + - - [4096, 10200, 1, 1024] + - [1, 91.006] + - - [1024, 2048, 1, 4096] + - [291, 80.07] + - - [1024, 2048, 1, 30528] + - [21, 80.399] + - - [1024, 4096, 1, 30528] + - [14, 81.987] + - - [1024, 10240, 1, 256] + - [7, 86.138] + - - [1024, 10496, 1, 256] + - [7, 85.754] + - - [1024, 11008, 1, 256] + - [7, 85.709] + - - [1024, 11264, 1, 256] + - [7, 87.162] + - - [1024, 11520, 1, 256] + - [4, 88.209] + - - [1024, 12288, 1, 256] + - [7, 86.972] + - - [1024, 13312, 1, 256] + - [7, 86.842] + - - [1024, 13568, 1, 256] + - [7, 87.69] + - - [1024, 14336, 1, 256] + - [4, 88.885] + - - [1024, 14592, 1, 256] + - [7, 87.712] + - - [1024, 14848, 1, 256] + - [7, 88.619] + - - [1024, 15104, 1, 256] + - [7, 87.73] + - - [1024, 1600, 1, 1024] + - [8, 75.968] + - - [1024, 1600, 1, 1] + - [6, 1.769] + - - [1024, 16128, 1, 256] + - [7, 89.012] + - - [1024, 17152, 1, 256] + - [7, 89.098] + - - [1024, 1792, 1, 256] + - [29, 70.694] + - - [1024, 18944, 1, 256] + - [4, 89.58] + - - [1024, 19712, 1, 256] + - [7, 88.746] + - - [1024, 19968, 1, 256] + - [4, 89.31] + - - [1024, 20480, 1, 256] + - [4, 89.138] + - - [1024, 2048, 1, 256] + - [29, 71.75] + - - [1024, 20992, 1, 256] + - [7, 89.224] + - - [1024, 21504, 1, 256] + - [7, 89.025] + - - [1024, 22016, 1, 256] + - [7, 89.152] + - - [1024, 23552, 1, 256] + - [11, 89.991] + - - [1024, 2560, 1, 256] + - [29, 78.806] + - - [1024, 28672, 1, 256] + - [4, 88.75] + - - [1024, 3072, 1, 256] + - [29, 78.847] + - - [1024, 3328, 1, 256] + - [29, 79.483] + - - [1024, 33536, 1, 256] + - [19, 87.816] + - - [1024, 3840, 1, 256] + - [29, 78.549] + - - [1024, 40448, 1, 256] + - [19, 88.164] + - - [1024, 4096, 1, 256] + - [29, 78.874] + - - [1024, 4608, 1, 256] + - [4, 83.589] + - - [1024, 4864, 1, 256] + - [29, 83.038] + - - [1024, 5120, 1, 256] + - [4, 83.476] + - - [1024, 5632, 1, 256] + - [7, 83.828] + - - [1024, 6144, 1, 256] + - [7, 83.679] + - - [1024, 6400, 1, 256] + - [29, 81.973] + - - [1024, 7168, 1, 256] + - [29, 84.324] + - - [1024, 7424, 1, 256] + - [4, 86.102] + - - [1024, 7680, 1, 256] + - [29, 84.026] + - - [1024, 7936, 1, 256] + - [11, 85.781] + - - [1024, 8192, 1, 256] + - [7, 84.004] + - - [1024, 8448, 1, 256] + - [4, 85.723] + - - [1024, 8704, 1, 256] + - [7, 84.076] + - - [1024, 8960, 1, 256] + - [7, 85.655] + - - [1024, 9728, 1, 256] + - [7, 87.415] + - - [1024, 9984, 1, 256] + - [4, 85.272] + - - [2048, 1024, 1, 1] + - [6, 1.841] + - - [2048, 1024, 1, 256] + - [14, 72.26] + - - [256, 8976, 1, 10240] + - [27, 81.419] + - - [256, 8976, 1, 10496] + - [13, 86.788] + - - [256, 8976, 1, 11008] + - [13, 86.869] + - - [256, 8976, 1, 11520] + - [13, 86.9] + - - [256, 8976, 1, 12288] + - [27, 81.382] + - - [256, 8976, 1, 14336] + - [27, 81.41] + - - [256, 8976, 1, 14848] + - [13, 87.099] + - - [256, 8976, 1, 15104] + - [28, 87.126] + - - [256, 8976, 1, 1536] + - [25, 82.38] + - - [256, 8976, 1, 15872] + - [13, 87.176] + - - [256, 8976, 1, 17152] + - [13, 87.189] + - - [256, 8976, 1, 19712] + - [40, 87.302] + - - [256, 8976, 1, 19968] + - [13, 87.23] + - - [256, 8976, 1, 20480] + - [12, 79.641] + - - [256, 8976, 1, 2048] + - [9, 83.395] + - - [256, 8976, 1, 20992] + - [13, 87.306] + - - [256, 8976, 1, 22016] + - [13, 87.324] + - - [256, 8976, 1, 2304] + - [9, 83.607] + - - [256, 8976, 1, 2560] + - [40, 85.235] + - - [256, 8976, 1, 26112] + - [13, 87.374] + - - [256, 8976, 1, 2816] + - [34, 84.293] + - - [256, 8976, 1, 3072] + - [40, 85.691] + - - [256, 8976, 1, 33536] + - [13, 87.288] + - - [256, 8976, 1, 4352] + - [13, 85.317] + - - [256, 8976, 1, 44505] + - [21, 87.649] + - - [256, 8976, 1, 4864] + - [13, 85.664] + - - [256, 8976, 1, 5376] + - [13, 85.921] + - - [256, 8976, 1, 5632] + - [40, 85.993] + - - [256, 8976, 1, 5888] + - [28, 86.088] + - - [256, 8976, 1, 6144] + - [11, 82.181] + - - [256, 8976, 1, 6656] + - [13, 86.269] + - - [256, 8976, 1, 7168] + - [27, 81.807] + - - [256, 8976, 1, 7424] + - [40, 86.445] + - - [256, 8976, 1, 8192] + - [39, 78.54] + - - [256, 8976, 1, 8448] + - [28, 86.494] + - - [256, 8976, 1, 8960] + - [40, 86.625] + - - [256, 8976, 1, 9472] + - [13, 86.702] + - - [256, 8976, 1, 9728] + - [40, 86.765] + - - [256, 8976, 1, 9984] + - [28, 86.783] + - - [3200, 1024, 1, 2048] + - [23, 82.384] + - - [4096, 1024, 1, 1] + - [6, 2.184] + - - [1024, 4096, 1, 4096] + - [276, 85.171] + - - [1024, 3072, 1, 3072] + - [35, 82.528] + - - [1024, 2048, 1, 3072] + - [28, 79.221] + - - [30528, 4096, 1, 1024] + - [36, 93.28] + - - [30528, 2048, 1, 1024] + - [36, 92.585] + - - [512, 32768, 1, 256] + - [36, 88.105] + - - [256, 32768, 1, 128] + - [29, 82.028] + - - [1024, 32768, 1, 512] + - [11, 89.819] + - - [1024, 32768, 1, 1024] + - [265, 92.563] + - - [479, 32768, 1, 1024] + - [39, 82.786] + - - [289, 128, 64, 768] + - [35, 60.552] + - - [289, 160, 64, 768] + - [22, 53.085] + - - [289, 192, 64, 768] + - [35, 63.579] + - - [3136, 256, 64, 64] + - [0, 62.248] + - - [784, 512, 64, 128] + - [29, 73.861] + - - [784, 128, 64, 512] + - [39, 72.648] + - - [196, 1024, 64, 256] + - [7, 64.486] + - - [196, 256, 64, 1024] + - [7, 59.478] + - - [3136, 256, 32, 64] + - [29, 81.843] + - - [784, 512, 32, 128] + - [2, 72.927] + - - [784, 128, 32, 512] + - [35, 69.796] + - - [196, 1024, 32, 256] + - [35, 63.349] + - - [256, 6912, 1, 4] + - [6, 6.944] + - - [512, 4096, 1, 256] + - [36, 71.366] + - - [1024, 4096, 1, 512] + - [22, 80.088] + - - [480, 4096, 1, 1024] + - [23, 71.118] + - - [512, 6912, 1, 256] + - [11, 81.879] + - - [1024, 6912, 1, 512] + - [23, 88.664] + - - [1024, 6912, 1, 1024] + - [23, 89.824] + - - [480, 6912, 1, 1024] + - [23, 81.811] + - - [256, 55296, 1, 128] + - [30, 85.817] + - - [512, 55296, 1, 256] + - [1, 88.141] + - - [1920, 2048, 1, 2048] + - [7, 84.635] + - - [2880, 3072, 1, 3072] + - [36, 86.765] + - - [3840, 4096, 1, 4096] + - [7, 91.714] + - - [7680, 8192, 1, 8192] + - [4, 92.716] + - - [2048, 2048, 1, 2048] + - [22, 81.301] + - - [3072, 3072, 1, 3072] + - [7, 92.143] + - - [4096, 4096, 1, 4096] + - [7, 91.367] + - - [8192, 8192, 1, 8192] + - [4, 92.788] + - - [1152, 1152, 1, 1152] + - [29, 72.454] + - - [1536, 1536, 1, 1536] + - [21, 87.415] + - - [1920, 1920, 1, 1920] + - [14, 82.158] + - - [2304, 2304, 1, 2304] + - [16, 86.806] + - - [2688, 2688, 1, 2688] + - [30, 86.932] + - - [3456, 3456, 1, 3456] + - [1, 89.761] + - - [3840, 3840, 1, 3840] + - [23, 89.571] + - - [4224, 4224, 1, 4224] + - [30, 91.583] + - - [4608, 4608, 1, 4608] + - [1, 93.348] + - - [4992, 4992, 1, 4992] + - [15, 92.468] + - - [5376, 5376, 1, 5376] + - [15, 92.111] + - - [5760, 5760, 1, 5760] + - [15, 92.901] + - - [6144, 6144, 1, 6144] + - [23, 92.612] + - - [6528, 6528, 1, 6528] + - [15, 93.271] + - - [6912, 6912, 1, 6912] + - [15, 93.009] + - - [7296, 7296, 1, 7296] + - [30, 93.496] + - - [7680, 7680, 1, 7680] + - [21, 93.09] + - - [1152, 1152, 1, 384] + - [29, 68.673] + - - [1536, 1536, 1, 384] + - [1, 82.51] + - - [1920, 1920, 1, 384] + - [29, 80.701] + - - [2304, 2304, 1, 384] + - [16, 84.856] + - - [2688, 2688, 1, 384] + - [1, 85.073] + - - [3072, 3072, 1, 384] + - [1, 90.194] + - - [3456, 3456, 1, 384] + - [30, 88.385] + - - [3840, 3840, 1, 384] + - [1, 88.371] + - - [4224, 4224, 1, 384] + - [15, 90.04] + - - [4608, 4608, 1, 384] + - [30, 91.719] + - - [4992, 4992, 1, 384] + - [15, 90.875] + - - [5376, 5376, 1, 384] + - [1, 90.677] + - - [5760, 5760, 1, 384] + - [15, 91.177] + - - [6144, 6144, 1, 384] + - [1, 90.505] + - - [6528, 6528, 1, 384] + - [1, 91.353] + - - [6912, 6912, 1, 384] + - [1, 91.074] + - - [7296, 7296, 1, 384] + - [1, 91.525] + - - [7680, 7680, 1, 384] + - [1, 91.895] + - - [8064, 8064, 1, 384] + - [1, 92.107] + - - [8448, 8448, 1, 384] + - [1, 92.283] + - - [8832, 8832, 1, 384] + - [15, 92.445] + - - [9216, 9216, 1, 384] + - [27, 91.047] + - - [9600, 9600, 1, 384] + - [1, 92.572] + - - [9984, 9984, 1, 384] + - [1, 92.594] + - - [10368, 10368, 1, 384] + - [1, 92.779] + - - [10752, 10752, 1, 384] + - [1, 92.711] + - - [11136, 11136, 1, 384] + - [1, 93.059] + - - [11520, 11520, 1, 384] + - [7, 92.941] + - - [11904, 11904, 1, 384] + - [1, 93.095] + - - [12288, 12288, 1, 384] + - [1, 92.459] + - - [12672, 12672, 1, 384] + - [1, 93.081] + - - [13056, 13056, 1, 384] + - [1, 93.059] + - - [13440, 13440, 1, 384] + - [15, 93.199] + - - [13824, 13824, 1, 384] + - [23, 92.653] + - - [14208, 14208, 1, 384] + - [7, 93.366] + - - [14592, 14592, 1, 384] + - [1, 93.145] + - - [14976, 14976, 1, 384] + - [1, 93.293] + - - [15360, 15360, 1, 384] + - [1, 92.978] + - - [15744, 15744, 1, 384] + - [1, 93.483] + - - [16128, 16128, 1, 384] + - [1, 93.271] + - - [16512, 16512, 1, 384] + - [15, 93.487] + - - [16896, 16896, 1, 384] + - [1, 93.375] + - - [17280, 17280, 1, 384] + - [1, 93.514] + - - [17664, 17664, 1, 384] + - [1, 93.456] + - - [18048, 18048, 1, 384] + - [15, 93.6] + - - [18432, 18432, 1, 384] + - [7, 92.811] + - - [18816, 18816, 1, 384] + - [15, 93.614] + - - [19200, 19200, 1, 384] + - [1, 93.465] + - - [19584, 19584, 1, 384] + - [1, 93.546] + - - [19968, 19968, 1, 384] + - [1, 93.438] + - - [20352, 20352, 1, 384] + - [15, 93.663] + - - [20736, 20736, 1, 384] + - [1, 93.528] + - - [21120, 21120, 1, 384] + - [15, 93.713] + - - [21504, 21504, 1, 384] + - [7, 93.334] + - - [21888, 21888, 1, 384] + - [1, 93.56] + - - [22272, 22272, 1, 384] + - [15, 93.587] + - - [22656, 22656, 1, 384] + - [15, 93.749] + - - [23040, 23040, 1, 384] + - [23, 93.271] + - - [8192, 1024, 1, 1024] + - [7, 86.476] + - - [8192, 4096, 1, 1024] + - [1, 91.895] + - - [16384, 16384, 1, 16384] + - [21, 86.472] + - - [1444, 256, 120, 128] + - [0, 71.375] + - - [1444, 256, 139, 128] + - [3, 60.155] + - - [1444, 256, 160, 128] + - [3, 60.204] + - - [1444, 256, 18, 128] + - [6, 74.877] + - - [1444, 256, 19, 128] + - [35, 75.928] + - - [1444, 256, 120, 256] + - [4, 80.57] + - - [1444, 256, 139, 256] + - [4, 80.85] + - - [1444, 256, 160, 256] + - [4, 80.471] + - - [1444, 256, 18, 256] + - [31, 79.361] + - - [1444, 256, 19, 256] + - [7, 79.235] + - - [361, 256, 120, 512] + - [40, 73.162] + - - [361, 256, 139, 512] + - [12, 71.511] + - - [361, 256, 160, 512] + - [11, 71.578] + - - [361, 256, 18, 512] + - [35, 72.147] + - - [361, 256, 19, 512] + - [35, 68.123] + - - [173280, 128, 1, 64] + - [35, 80.886] + - - [200716, 128, 1, 64] + - [0, 65.975] + - - [231040, 128, 1, 64] + - [7, 46.029] + - - [25992, 128, 1, 64] + - [6, 56.708] + - - [27436, 128, 1, 64] + - [16, 64.342] + - - [8192, 7680, 1, 8192] + - [4, 92.716] + - - [4096, 3840, 1, 4096] + - [7, 91.408] + - - [2048, 1920, 1, 2048] + - [7, 84.992] + - - [1024, 1280, 1, 2] + - [6, 3.113] + - - [1024, 1280, 1, 4096] + - [6, 72.765] + - - [4096, 1280, 1, 1024] + - [7, 88.24] + - - [1024, 4992, 1, 2] + - [0, 4.886] + - - [1024, 4992, 1, 4096] + - [23, 87.527] + - - [4096, 4992, 1, 1024] + - [7, 91.773] + - - [1024, 5120, 1, 2] + - [6, 4.665] + - - [1024, 5120, 1, 1024] + - [7, 87.582] + - - [1024, 5120, 1, 4096] + - [7, 88.547] + - - [4096, 5120, 1, 1024] + - [7, 91.317] + - - [1024, 5248, 1, 2] + - [6, 4.652] + - - [1024, 5248, 1, 1024] + - [35, 82.195] + - - [1024, 5248, 1, 4096] + - [7, 83.034] + - - [4096, 5248, 1, 1024] + - [7, 91.259] + - - [1024, 2560, 1, 2] + - [6, 3.916] + - - [1024, 2560, 1, 4096] + - [22, 82.912] + - - [4096, 2560, 1, 1024] + - [7, 90.248] + - - [1024, 1152, 1, 2] + - [35, 3.059] + - - [1024, 1152, 1, 4096] + - [24, 82.177] + - - [4096, 1152, 1, 1024] + - [7, 88.691] + - - [1024, 8192, 1, 1024] + - [23, 86.643] + - - [1024, 8192, 1, 4096] + - [265, 87.548] + - - [1024, 8192, 1, 33712] + - [1, 87.717] + - - [1024, 9600, 1, 1024] + - [23, 89.31] + - - [1024, 9600, 1, 4096] + - [11, 89.273] + - - [1024, 9600, 1, 33712] + - [30, 90.695] + - - [4096, 8192, 1, 1024] + - [262, 92.908] + - - [4096, 9600, 1, 1024] + - [4, 91.583] + - - [1024, 10064, 1, 1024] + - [23, 88.281] + - - [1024, 10064, 1, 4096] + - [23, 89.098] + - - [1024, 10080, 1, 4096] + - [13, 88.98] + - - [1024, 10080, 1, 42720] + - [21, 90.212] + - - [1024, 6528, 1, 1024] + - [7, 85.01] + - - [1024, 6528, 1, 4096] + - [23, 86.233] + - - [1024, 6528, 1, 42720] + - [21, 87.333] + - - [1024, 7104, 1, 1024] + - [23, 85.565] + - - [1024, 7104, 1, 4096] + - [23, 87.076] + - - [1024, 7104, 1, 42720] + - [15, 87.162] + - - [1024, 8064, 1, 1024] + - [7, 90.153] + - - [1024, 8064, 1, 4096] + - [11, 90.505] + - - [1024, 9216, 1, 1024] + - [23, 90.731] + - - [1024, 9216, 1, 4096] + - [28, 91.218] + - - [4096, 10064, 1, 1024] + - [1, 91.173] + - - [4096, 10080, 1, 1024] + - [4, 90.911] + - - [4096, 6528, 1, 1024] + - [4, 90.672] + - - [4096, 7104, 1, 1024] + - [4, 90.451] + - - [4096, 8064, 1, 1024] + - [1, 91.836] + - - [4096, 9216, 1, 1024] + - [4, 91.538] + - - [480, 32768, 1, 1024] + - [27, 83.219] + - - [2048, 960, 1, 2048] + - [22, 79.289] + - - [2048, 1024, 1, 30592] + - [34, 80.421] + - - [2048, 1024, 1, 6144] + - [28, 79.524] + - - [2048, 1024, 1, 8192] + - [13, 79.894] + - - [8192, 1024, 1, 2048] + - [7, 87.225] + - - [1024, 8192, 1, 30592] + - [7, 86.603] + - - [1024, 8192, 1, 3072] + - [23, 87.5] + - - [512, 512, 256, 64] + - [0, 57.164] + - - [1024, 2048, 1, 30592] + - [21, 80.381] + - - [1024, 4096, 1, 30592] + - [5, 81.789] + - - [512, 512, 128, 64] + - [0, 78.364] + - - [2560, 2048, 1, 1920] + - [30, 89.594] + - - [2560, 2048, 1, 2560] + - [23, 89.63] + - - [2560, 2048, 1, 7680] + - [36, 88.637] + - - [640, 2048, 1, 2560] + - [35, 72.201] + - - [512, 512, 40, 64] + - [0, 81.13] + - - [1536, 4096, 1, 1536] + - [36, 88.218] + - - [1536, 4096, 1, 4608] + - [23, 88.795] + - - [1536, 4096, 1, 50304] + - [4, 87.694] + - - [1536, 4096, 1, 6144] + - [23, 88.588] + - - [6144, 4096, 1, 1536] + - [1, 92.432] + - - [1024, 1024, 64, 96] + - [0, 69.756] + - - [1536, 8192, 1, 1536] + - [36, 89.652] + - - [1536, 8192, 1, 4608] + - [36, 89.959] + - - [1536, 8192, 1, 50304] + - [34, 90.122] + - - [1536, 8192, 1, 6144] + - [40, 89.792] + - - [6144, 8192, 1, 1536] + - [1, 93.176] + - - [1024, 1024, 128, 96] + - [0, 68.569] + - - [1024, 16384, 1, 1024] + - [280, 90.658] + - - [1024, 16384, 1, 3072] + - [23, 90.69] + - - [1024, 16384, 1, 4096] + - [265, 91.625] + - - [1024, 16384, 1, 50304] + - [4, 90.541] + - - [4096, 16384, 1, 1024] + - [262, 93.496] + - - [1024, 1024, 256, 64] + - [10, 49.701] + - - [1024, 2048, 1, 50304] + - [34, 80.39] + - - [1024, 1024, 32, 64] + - [0, 81.107] + - - [1024, 4096, 1, 50304] + - [21, 81.811] + - - [1024, 1024, 64, 64] + - [0, 67.586] + - - [1024, 8192, 1, 50304] + - [33, 86.404] + - - [1024, 1024, 128, 64] + - [0, 55.657] + - - [128, 128, 1024, 64] + - [20, 73.221] + - - [1024, 8192, 1, 30528] + - [15, 87.46] + - - [1024, 3456, 1, 1024] + - [7, 86.287] + - - [1024, 3456, 1, 512] + - [7, 84.942] + - - [256, 6912, 1, 128] + - [0, 70.766] + - - [480, 3456, 1, 1024] + - [24, 75.896] + - - [512, 3456, 1, 256] + - [35, 76.257] + - - [1024, 1280, 1, 30528] + - [14, 73.613] + - - [1024, 1600, 1, 30528] + - [2, 79.127] + - - [1024, 10240, 1, 1024] + - [23, 89.928] + - - [1024, 10240, 1, 4096] + - [27, 89.883] + - - [4096, 10240, 1, 1024] + - [1, 91.624] + - - [128, 128, 1280, 64] + - [23, 32.633] + - - [1024, 1640, 1, 30528] + - [37, 80.994] + - - [1024, 10496, 1, 1024] + - [7, 88.015] + - - [1024, 10496, 1, 4096] + - [23, 88.421] + - - [4096, 10496, 1, 1024] + - [1, 92.224] + - - [128, 128, 1312, 64] + - [36, 30.878] + - - [1024, 6144, 1, 4096] + - [265, 88.584] + - - [4096, 6144, 1, 1024] + - [262, 92.413] + - - [1024, 6144, 1, 1024] + - [7, 87.027] + - - [512, 512, 192, 64] + - [245, 67.9] + - - [256, 6912, 1, 1] + - [6, 1.692] + - - [3136, 128, 64, 64] + - [36, 49.724] + - - [3136, 256, 64, 128] + - [18, 66.783] + - - [784, 512, 64, 256] + - [4, 75.698] + - - [3136, 128, 64, 256] + - [32, 74.714] + - - [3136, 256, 64, 256] + - [19, 86.837] + - - [196, 1024, 64, 512] + - [27, 65.217] + - - [784, 256, 64, 512] + - [4, 75.21] + - - [784, 512, 64, 512] + - [4, 77.669] + - - [196, 512, 64, 1024] + - [27, 61.45] + - - [196, 1024, 64, 1024] + - [27, 64.621] + - - [3136, 128, 32, 64] + - [14, 78.084] + - - [3136, 256, 32, 128] + - [0, 83.688] + - - [784, 512, 32, 256] + - [36, 76.068] + - - [3136, 128, 32, 256] + - [4, 80.665] + - - [3136, 256, 32, 256] + - [19, 84.726] + - - [196, 1024, 32, 512] + - [36, 64.477] + - - [784, 256, 32, 512] + - [23, 74.263] + - - [784, 512, 32, 512] + - [36, 78.716] + - - [196, 512, 32, 1024] + - [35, 61.427] + - - [196, 1024, 32, 1024] + - [27, 63.787] + - - [1024, 10224, 1, 1024] + - [7, 89.445] + - - [1024, 10192, 1, 1024] + - [23, 89.661] + - - [1024, 10208, 1, 1024] + - [23, 89.77] + - - [1024, 10224, 1, 4096] + - [13, 89.928] + - - [1024, 10224, 1, 3072] + - [28, 89.666] + - - [4096, 10224, 1, 1024] + - [1, 91.186] + - - [1024, 10240, 1, 3072] + - [11, 89.923] + - - [1024, 10192, 1, 3072] + - [28, 89.589] + - - [4096, 10192, 1, 1024] + - [1, 91.344] + - - [1024, 10192, 1, 4096] + - [13, 89.779] + - - [1024, 10200, 1, 3072] + - [28, 89.612] + - - [1024, 10184, 1, 1024] + - [23, 89.043] + - - [4096, 10208, 1, 1024] + - [1, 91.223] + - - [1024, 10208, 1, 3072] + - [28, 89.535] + - - [1024, 10208, 1, 4096] + - [13, 89.842] + - - [1024, 10224, 1, 2048] + - [7, 90.298] + - - [1024, 10240, 1, 2048] + - [7, 90.347] + - - [1024, 10120, 1, 1024] + - [7, 88.461] + - - [1024, 10192, 1, 2048] + - [7, 90.086] + - - [1024, 10152, 1, 1024] + - [7, 88.791] + - - [1024, 10080, 1, 3072] + - [23, 89.319] + - - [100352, 512, 1, 256] + - [27, 89.612] + - - [12544, 2048, 1, 1024] + - [36, 91.836] + - - [200704, 512, 1, 256] + - [27, 90.695] + - - [25088, 1024, 1, 512] + - [1, 91.263] + - - [50176, 1024, 1, 512] + - [7, 91.204] + - - [6272, 2048, 1, 1024] + - [7, 90.902] + - - [196, 1024, 128, 256] + - [10, 61.427] + - - [196, 1024, 256, 256] + - [10, 61.973] + - - [196, 256, 128, 1024] + - [38, 57.533] + - - [196, 256, 256, 1024] + - [26, 59.889] + - - [196, 512, 128, 1024] + - [39, 63.2] + - - [196, 512, 256, 1024] + - [39, 64.288] + - - [3136, 128, 128, 256] + - [32, 73.311] + - - [3136, 128, 256, 256] + - [32, 72.341] + - - [784, 256, 128, 512] + - [11, 76.993] + - - [784, 256, 256, 512] + - [11, 78.698] + - - [128, 128, 2048, 64] + - [0, 31.162] + - - [1024, 2560, 1, 30528] + - [14, 82.506] + - - [128, 128, 1536, 64] + - [0, 36.18] + - - [1024, 12288, 1, 4096] + - [265, 90.198] + - - [1024, 12288, 1, 1024] + - [280, 89.158] + - - [4096, 12288, 1, 1024] + - [262, 92.83] + - - [1024, 1920, 1, 30528] + - [14, 80.169] + - - [128, 128, 192, 64] + - [14, 56.843] + - - [384, 384, 144, 64] + - [0, 83.783] + - - [768, 4608, 1, 2] + - [29, 4.061] + - - [3072, 4608, 1, 768] + - [1, 91.579] + - - [768, 4608, 1, 3072] + - [40, 89.472] + - - [768, 4608, 1, 768] + - [36, 87.004] + - - [512, 512, 48, 64] + - [29, 82.055] + - - [128, 128, 256, 64] + - [0, 64.698] + - - [384, 384, 192, 64] + - [0, 80.846] + - - [1024, 4608, 1, 2] + - [6, 4.462] + - - [4096, 4608, 1, 1024] + - [7, 92.197] + - - [1024, 4608, 1, 4096] + - [13, 90.442] + - - [1024, 4608, 1, 1024] + - [7, 88.015] + - - [3072, 256, 2, 1024] + - [31, 73.519] + - - [2852, 256, 2, 1024] + - [24, 67.856] + - - [3220, 256, 2, 1024] + - [8, 76.059] + - - [850, 2048, 2, 512] + - [35, 75.644] + - - [768, 2048, 2, 512] + - [35, 80.652] + - - [2904, 256, 2, 1024] + - [8, 69.539] + - - [805, 2048, 2, 512] + - [35, 71.493] + - - [864, 2048, 2, 512] + - [22, 76.907] + - - [2992, 256, 2, 1024] + - [7, 71.56] + - - [3400, 256, 2, 1024] + - [8, 80.782] + - - [4032, 256, 2, 1024] + - [5, 74.281] + - - [15200, 128, 2, 512] + - [7, 80.516] + - - [12288, 128, 2, 512] + - [29, 80.954] + - - [888, 2048, 2, 512] + - [22, 78.878] + - - [13600, 128, 2, 512] + - [23, 82.655] + - - [12880, 128, 2, 512] + - [1, 78.508] + - - [3456, 256, 2, 1024] + - [16, 83.061] + - - [2944, 256, 2, 1024] + - [23, 70.784] + - - [2688, 256, 2, 1024] + - [22, 74.507] + - - [13824, 128, 2, 512] + - [1, 86.472] + - - [3036, 256, 2, 1024] + - [8, 72.571] + - - [3168, 256, 2, 1024] + - [8, 75.283] + - - [3360, 256, 2, 1024] + - [8, 80.137] + - - [3552, 256, 2, 1024] + - [6, 72.431] + - - [11616, 128, 2, 512] + - [14, 75.869] + - - [4200, 256, 2, 1024] + - [7, 77.76] + - - [840, 2048, 2, 512] + - [35, 74.795] + - - [14208, 128, 2, 512] + - [29, 80.079] + - - [11968, 128, 2, 512] + - [29, 77.588] + - - [3264, 256, 2, 1024] + - [8, 77.254] + - - [713, 2048, 2, 512] + - [22, 74.168] + - - [13600, 256, 2, 512] + - [36, 87.221] + - - [12880, 256, 2, 512] + - [29, 82.871] + - - [12288, 256, 2, 512] + - [1, 86.891] + - - [2816, 256, 2, 1024] + - [6, 77.633] + - - [850, 2048, 1, 512] + - [6, 67.518] + - - [660, 2048, 2, 512] + - [22, 68.732] + - - [672, 2048, 2, 512] + - [22, 70.225] + - - [13440, 128, 2, 512] + - [1, 84.459] + - - [726, 2048, 2, 512] + - [35, 75.635] + - - [3500, 256, 2, 1024] + - [6, 71.114] + - - [13824, 256, 2, 512] + - [1, 89.982] + - - [15200, 256, 2, 512] + - [1, 84.514] + - - [3700, 256, 2, 1024] + - [35, 74.624] + - - [748, 2048, 2, 512] + - [22, 77.723] + - - [3600, 256, 2, 1024] + - [22, 72.882] + - - [4032, 1024, 2, 256] + - [7, 82.849] + - - [16128, 128, 2, 512] + - [31, 84.672] + - - [15200, 128, 1, 512] + - [0, 75.662] + - - [13600, 128, 1, 512] + - [2, 77.899] + - - [2904, 1024, 2, 256] + - [0, 80.855] + - - [2992, 1024, 2, 256] + - [1, 82.736] + - - [1536, 2048, 1, 1024] + - [22, 81.473] + - - [24576, 128, 1, 256] + - [0, 79.262] + - - [24576, 512, 1, 256] + - [1, 86.954] + - - [25760, 128, 1, 256] + - [16, 74.556] + - - [25760, 512, 1, 256] + - [30, 86.869] + - - [6144, 256, 1, 512] + - [29, 72.233] + - - [6440, 256, 1, 512] + - [37, 74.529] + - - [3036, 1024, 2, 256] + - [7, 82.037] + - - [13600, 512, 1, 128] + - [7, 81.536] + - - [9408, 512, 2, 128] + - [0, 82.813] + - - [56000, 256, 2, 64] + - [0, 81.391] + - - [2852, 1024, 2, 256] + - [29, 78.206] + - - [2816, 1024, 2, 256] + - [30, 84.766] + - - [60800, 256, 1, 64] + - [29, 83.259] + - - [2944, 1024, 2, 256] + - [14, 82.434] + - - [11776, 512, 2, 128] + - [29, 84.613] + - - [11616, 512, 2, 128] + - [29, 82.208] + - - [4200, 1024, 2, 256] + - [1, 84.622] + - - [54400, 256, 1, 64] + - [14, 82.267] + - - [15200, 512, 1, 128] + - [29, 79.799] + - - [2688, 1024, 2, 256] + - [29, 82.768] + - - [12672, 512, 2, 128] + - [30, 86.693] + - - [11968, 512, 2, 128] + - [22, 83.656] + - - [46464, 256, 2, 64] + - [29, 84.306] + - - [2400, 256, 2, 1024] + - [35, 66.056] + - - [2520, 256, 2, 1024] + - [35, 69.508] + - - [2400, 1024, 2, 256] + - [14, 81.545] + - - [10752, 128, 2, 512] + - [36, 79.537] + - - [45632, 256, 2, 64] + - [35, 82.073] + - - [2520, 1024, 2, 256] + - [1, 83.02] + - - [53760, 256, 2, 64] + - [0, 82.411] + - - [2352, 256, 2, 1024] + - [35, 64.752] + - - [47872, 256, 2, 64] + - [35, 83.341] + - - [47104, 256, 2, 64] + - [35, 83.341] + - - [50688, 256, 2, 64] + - [35, 83.652] + - - [45056, 256, 2, 64] + - [35, 83.201] + - - [13440, 512, 2, 128] + - [36, 85.389] + - - [2352, 1024, 2, 256] + - [6, 79.072] + - - [11264, 512, 2, 128] + - [1, 84.554] + - - [10560, 128, 2, 512] + - [14, 75.346] + - - [16128, 512, 2, 128] + - [1, 86.625] + - - [37632, 256, 2, 64] + - [29, 83.011] + - - [51520, 256, 2, 64] + - [35, 82.131] + - - [14000, 512, 2, 128] + - [1, 83.291] + - - [10560, 512, 2, 128] + - [6, 83.332] + - - [64512, 256, 2, 64] + - [0, 77.931] + - - [54400, 256, 2, 64] + - [14, 83.426] + - - [3264, 1024, 2, 256] + - [1, 81.703] + - - [10752, 512, 2, 128] + - [7, 84.306] + - - [3168, 1024, 2, 256] + - [1, 80.787] + - - [950, 2048, 1, 512] + - [7, 68.032] + - - [55296, 256, 2, 256] + - [39, 87.803] + - - [51520, 256, 2, 256] + - [33, 87.135] + - - [11408, 128, 2, 512] + - [16, 80.873] + - - [60800, 256, 2, 256] + - [33, 88.836] + - - [54400, 256, 2, 256] + - [33, 87.582] + - - [3700, 1024, 2, 256] + - [7, 83.751] + - - [60800, 256, 2, 64] + - [1, 72.733] + - - [3800, 1024, 1, 256] + - [6, 77.169] + - - [3400, 1024, 1, 256] + - [2, 79.943] + - - [3072, 1024, 2, 256] + - [1, 84.694] + - - [3600, 1024, 2, 256] + - [1, 83.652] + - - [12288, 512, 2, 128] + - [7, 84.279] + - - [49152, 256, 2, 256] + - [39, 85.872] + - - [12880, 512, 2, 128] + - [6, 82.646] + - - [11408, 512, 2, 128] + - [30, 84.035] + - - [42240, 256, 2, 64] + - [22, 83.309] + - - [1008, 2048, 2, 512] + - [22, 78.522] + - - [3360, 1024, 2, 256] + - [1, 84.053] + - - [14208, 512, 2, 128] + - [36, 86.404] + - - [56832, 256, 2, 64] + - [0, 80.958] + - - [43008, 256, 2, 64] + - [14, 81.978] + - - [13600, 512, 2, 128] + - [0, 83.638] + - - [3500, 1024, 2, 256] + - [14, 82.019] + - - [2640, 1024, 2, 256] + - [0, 80.746] + - - [13824, 512, 2, 128] + - [1, 86.196] + - - [3800, 256, 2, 1024] + - [22, 76.591] + - - [55296, 256, 2, 64] + - [0, 80.223] + - - [2640, 256, 2, 1024] + - [35, 72.395] + - - [15200, 512, 2, 128] + - [1, 85.542] + - - [3552, 1024, 2, 256] + - [1, 83.67] + - - [3220, 1024, 2, 256] + - [7, 81.031] + - - [3456, 1024, 2, 256] + - [1, 87.735] + - - [49152, 256, 2, 64] + - [35, 82.799] + - - [3400, 1024, 2, 256] + - [36, 82.98] + - - [950, 2048, 2, 512] + - [22, 74.006] + - - [3800, 1024, 2, 256] + - [6, 82.104] + - - [1610, 2048, 1, 1024] + - [7, 80.624] + - - [6912, 256, 1, 512] + - [0, 80.304] + - - [6800, 256, 1, 512] + - [31, 78.815] + - - [27648, 128, 1, 256] + - [7, 83.147] + - - [27200, 128, 1, 256] + - [2, 78.725] + - - [30400, 128, 1, 256] + - [8, 76.799] + - - [7600, 256, 1, 512] + - [0, 76.28] + - - [6144, 1024, 1, 512] + - [1, 86.819] + - - [6912, 1024, 1, 512] + - [1, 89.883] + - - [6440, 1024, 1, 512] + - [7, 83.07] + - - [27648, 512, 1, 256] + - [1, 89.58] + - - [1728, 2048, 1, 1024] + - [35, 77.818] + - - [27200, 512, 1, 256] + - [23, 87.487] + - - [6800, 1024, 1, 512] + - [36, 86.909] + - - [1700, 2048, 1, 1024] + - [22, 76.681] + - - [7600, 1024, 1, 512] + - [1, 84.41] + - - [30400, 512, 1, 256] + - [30, 87.491] + - - [1900, 2048, 1, 1024] + - [7, 82.655] + - - [12544, 1024, 1, 1024] + - [7, 91.191] + - - [1024, 1024, 160, 96] + - [0, 70.279] + - - [1920, 16384, 1, 25216] + - [15, 92.125] + - - [3840, 16384, 1, 1920] + - [1, 93.934] + - - [1920, 16384, 1, 3840] + - [23, 92.405] + - - [960, 16384, 1, 1920] + - [1, 85.47] + - - [1920, 16384, 1, 2880] + - [15, 92.847] + - - [1024, 1024, 40, 96] + - [0, 85.466] + - - [1920, 4096, 1, 25216] + - [21, 88.073] + - - [3840, 4096, 1, 1920] + - [15, 91.904] + - - [1920, 4096, 1, 3840] + - [34, 87.555] + - - [960, 4096, 1, 1920] + - [0, 76.271] + - - [1920, 4096, 1, 2880] + - [15, 87.924] + - - [1024, 1024, 80, 96] + - [0, 76.889] + - - [1920, 8192, 1, 25216] + - [30, 91.597] + - - [3840, 8192, 1, 1920] + - [1, 92.793] + - - [1920, 8192, 1, 3840] + - [23, 91.317] + - - [960, 8192, 1, 1920] + - [1, 81.924] + - - [1920, 8192, 1, 2880] + - [30, 92.147] + - - [1024, 1024, 96, 96] + - [0, 72.806] + - - [2304, 16384, 1, 12672] + - [30, 93.79] + - - [2304, 16384, 1, 2304] + - [1, 93.627] + - - [576, 16384, 1, 2304] + - [36, 81.387] + - - [2304, 16384, 1, 1728] + - [15, 93.853] + - - [1024, 1024, 24, 96] + - [7, 85.542] + - - [2304, 4096, 1, 12672] + - [21, 92.662] + - - [2304, 4096, 1, 2304] + - [1, 92.161] + - - [576, 4096, 1, 2304] + - [35, 74.344] + - - [2304, 4096, 1, 1728] + - [30, 92.445] + - - [1024, 1024, 48, 96] + - [0, 85.872] + - - [2304, 8192, 1, 12672] + - [15, 93.212] + - - [2304, 8192, 1, 2304] + - [1, 93.063] + - - [576, 8192, 1, 2304] + - [7, 80.458] + - - [2304, 8192, 1, 1728] + - [30, 93.361] + - - [1024, 1024, 16, 96] + - [35, 84.437] + - - [3072, 4096, 1, 6400] + - [1, 90.135] + - - [1536, 4096, 1, 3072] + - [36, 88.547] + - - [3072, 4096, 1, 1536] + - [7, 89.481] + - - [384, 4096, 1, 3072] + - [37, 75.197] + - - [3072, 4096, 1, 1152] + - [1, 89.815] + - - [1024, 1024, 32, 96] + - [0, 85.804] + - - [3072, 8192, 1, 6400] + - [15, 93.117] + - - [1536, 8192, 1, 3072] + - [23, 89.607] + - - [3072, 8192, 1, 1536] + - [1, 92.197] + - - [384, 8192, 1, 3072] + - [22, 82.398] + - - [3072, 8192, 1, 1152] + - [1, 92.662] + - - [2048, 4096, 1, 2048] + - [7, 87.108] + - - [2048, 4096, 1, 4096] + - [7, 87.469] + - - [4096, 4096, 1, 2048] + - [7, 91.457] + - - [1024, 2283, 1, 29000] + - [21, 89.143] + - - [1024, 2296, 1, 29000] + - [21, 89.598] + - - [1024, 2306, 1, 29000] + - [29, 75.17] + - - [1024, 2309, 1, 29000] + - [29, 75.233] + - - [1024, 2318, 1, 29000] + - [0, 75.129] + - - [1024, 2320, 1, 29000] + - [29, 75.698] + - - [1024, 2324, 1, 29000] + - [0, 75.332] + - - [1024, 2325, 1, 29000] + - [29, 75.847] + - - [1024, 2329, 1, 29000] + - [29, 75.847] + - - [1024, 2338, 1, 29000] + - [14, 76.027] + - - [1024, 2345, 1, 29000] + - [29, 76.122] + - - [1024, 2350, 1, 29000] + - [29, 76.438] + - - [1024, 2362, 1, 29000] + - [29, 76.853] + - - [1024, 2366, 1, 29000] + - [29, 76.961] + - - [1024, 2368, 1, 29000] + - [29, 77.051] + - - [1024, 2374, 1, 29000] + - [14, 76.893] + - - [1024, 2390, 1, 29000] + - [0, 77.575] + - - [512, 512, 320, 64] + - [0, 52.99] + - - [512, 512, 80, 64] + - [29, 82.389] + - - [2560, 1024, 1, 2560] + - [29, 82.907] + - - [2560, 1024, 1, 4096] + - [35, 82.862] + - - [4096, 1024, 1, 2560] + - [29, 81.522] + - - [1024, 1024, 512, 64] + - [10, 49.846] + - - [1024, 32768, 1, 3072] + - [23, 92.373] + - - [1024, 32768, 1, 4096] + - [265, 93.511] + - - [1024, 32768, 1, 50304] + - [5, 90.32] + - - [4096, 32768, 1, 1024] + - [280, 93.944] + - - [1024, 1024, 24, 128] + - [7, 87.162] + - - [128, 1024, 24, 1024] + - [10, 78.075] + - - [768, 320, 1, 30522] + - [45, 67.374] + - - [768, 640, 1, 30522] + - [44, 78.26] + - - [768, 1280, 1, 30522] + - [42, 84.775] + - - [1024, 780, 1, 30522] + - [41, 76.226] + - - [1024, 308, 1, 30522] + - [41, 68.529] + - - [1024, 800, 1, 30522] + - [41, 78.057] + - - [1024, 820, 1, 30522] + - [41, 79.921] + - - [1024, 385, 1, 30522] + - [41, 63.629] + - - [1024, 462, 1, 30522] + - [43, 69.747] + - - [1024, 640, 1, 30528] + - [41, 81.197] + - - [2048, 199, 1, 29000] + - [50, 60.286] + - - [2048, 221, 1, 29000] + - [48, 66.449] + - - [2048, 224, 1, 29000] + - [46, 67.297] + - - [2048, 229, 1, 29000] + - [51, 69.138] + - - [2048, 234, 1, 29000] + - [51, 70.505] + - - [2048, 242, 1, 29000] + - [51, 72.878] + - - [2048, 246, 1, 29000] + - [43, 73.64] + - - [2048, 247, 1, 29000] + - [43, 74.421] + - - [2048, 256, 1, 29000] + - [50, 77.033] + - - [2048, 262, 1, 29000] + - [47, 65.623] + - - [2048, 264, 1, 29000] + - [47, 66.155] + - - [2048, 265, 1, 29000] + - [47, 66.399] + - - [2048, 274, 1, 29000] + - [47, 68.456] + - - [2048, 277, 1, 29000] + - [49, 68.953] + - - [2048, 279, 1, 29000] + - [47, 69.9] + - - [2048, 288, 1, 29000] + - [47, 71.795] + - - [2048, 296, 1, 29000] + - [47, 73.821] + - - [2048, 315, 1, 29000] + - [47, 78.572] + - - [2048, 335, 1, 29000] + - [47, 70.338] + - - [1024, 561, 1, 29000] + - [41, 80.552] + - - [1024, 574, 1, 29000] + - [41, 82.343] + - - [1024, 600, 1, 29000] + - [41, 76.392] + - - [1024, 608, 1, 29000] + - [41, 77.299] + - - [1024, 615, 1, 29000] + - [41, 78.057] + - - [1024, 622, 1, 29000] + - [41, 78.991] + - - [1024, 625, 1, 29000] + - [41, 79.257] + - - [1024, 626, 1, 29000] + - [41, 79.456] + - - [1024, 628, 1, 29000] + - [41, 79.763] + - - [1024, 636, 1, 29000] + - [41, 80.661] + - - [1024, 651, 1, 29000] + - [41, 75.657] + - - [1024, 658, 1, 29000] + - [41, 76.686] + - - [1024, 669, 1, 29000] + - [41, 77.746] + - - [1024, 670, 1, 29000] + - [41, 77.886] + - - [1024, 672, 1, 29000] + - [41, 78.157] + - - [1024, 684, 1, 29000] + - [41, 79.361] + - - [1024, 716, 1, 29000] + - [41, 76.217] + - - [1024, 730, 1, 29000] + - [41, 77.692] + - - [1600, 512, 1, 1024] + - [82, 66.164] + - - [1024, 512, 1, 1] + - [93, 0.961] + - - [1024, 512, 1, 64] + - [53, 30.522] + - - [2048, 512, 1, 1] + - [93, 1.548] + - - [768, 640, 1, 768] + - [97, 57.267] + - - [768, 1024, 1, 2] + - [93, 2.752] + - - [768, 1024, 1, 768] + - [97, 66.755] + - - [768, 1280, 1, 768] + - [64, 66.219] + - - [768, 512, 1, 2] + - [93, 1.674] + - - [768, 512, 1, 768] + - [96, 47.242] + - - [1024, 512, 1, 1024] + - [105, 61.229] + - - [1024, 512, 1, 2] + - [93, 2.075] + - - [64, 64, 768, 64] + - [77, 45.013] + - - [64, 64, 96, 64] + - [55, 26.407] + - - [704, 1024, 1, 128] + - [73, 45.952] + - - [1024, 1024, 1, 3328] + - [57, 73.546] + - - [1856, 448, 1, 3328] + - [56, 73.329] + - - [128, 6784, 1, 3328] + - [106, 60.908] + - - [2368, 448, 1, 128] + - [54, 56.103] + - - [256, 4288, 1, 3328] + - [64, 76.582] + - - [704, 1856, 1, 3328] + - [97, 72.481] + - - [448, 1024, 1, 1280] + - [74, 54.335] + - - [256, 1408, 1, 3328] + - [81, 45.577] + - - [704, 1856, 1, 1280] + - [82, 71.159] + - - [128, 5056, 1, 128] + - [97, 42.185] + - - [2368, 128, 1, 256] + - [55, 33.089] + - - [64, 5056, 1, 256] + - [62, 34.479] + - - [256, 2944, 1, 256] + - [63, 57.042] + - - [256, 1856, 1, 1280] + - [105, 56.974] + - - [4288, 256, 1, 256] + - [56, 65.086] + - - [2944, 128, 1, 128] + - [55, 34.217] + - - [5888, 64, 1, 3328] + - [55, 47.387] + - - [2944, 256, 1, 3328] + - [97, 67.166] + - - [1408, 448, 1, 1280] + - [74, 55.395] + - - [1408, 704, 1, 3328] + - [57, 69.999] + - - [1408, 256, 1, 1280] + - [55, 44.765] + - - [3072, 128, 1, 1024] + - [55, 47.883] + - - [6784, 64, 1, 256] + - [74, 43.457] + - - [2944, 256, 1, 256] + - [74, 57.421] + - - [704, 1408, 1, 3328] + - [64, 69.278] + - - [2944, 256, 1, 128] + - [97, 49.236] + - - [2368, 128, 1, 3328] + - [104, 38.715] + - - [64, 193600, 1, 64] + - [65, 61.644] + - - [448, 1408, 1, 256] + - [63, 46.227] + - - [64, 5056, 1, 3328] + - [62, 40.966] + - - [512, 1500, 1, 2816] + - [97, 67.816] + - - [1024, 448, 1, 128] + - [54, 36.694] + - - [256, 3584, 1, 3328] + - [75, 64.215] + - - [256, 1408, 1, 256] + - [104, 37.984] + - - [5056, 64, 1, 1280] + - [55, 40.154] + - - [1024, 704, 1, 256] + - [63, 54.488] + - - [128, 4288, 1, 128] + - [63, 42.834] + - - [3584, 256, 1, 128] + - [63, 50.752] + - - [448, 1024, 1, 256] + - [61, 44.007] + - - [5888, 64, 1, 256] + - [73, 39.784] + - - [1856, 256, 1, 1280] + - [74, 55.544] + - - [64, 5888, 1, 3328] + - [104, 46.683] + - - [448, 1856, 1, 128] + - [54, 52.079] + - - [1024, 704, 1, 1280] + - [56, 62.699] + - - [128, 5888, 1, 256] + - [63, 56.893] + - - [704, 704, 1, 3328] + - [56, 61.057] + - - [704, 1408, 1, 1280] + - [64, 67.554] + - - [3584, 256, 1, 3328] + - [98, 65.28] + - - [704, 1856, 1, 128] + - [54, 58.77] + - - [128, 3584, 1, 3328] + - [82, 56.536] + - - [2944, 448, 1, 128] + - [74, 62.636] + - - [64, 193600, 1, 256] + - [92, 42.266] + - - [128, 2944, 1, 1280] + - [81, 45.564] + - - [448, 2944, 1, 1280] + - [64, 61.914] + - - [3584, 128, 1, 256] + - [56, 45.212] + - - [448, 1408, 1, 3328] + - [56, 56.036] + - - [704, 1024, 1, 256] + - [103, 53.103] + - - [256, 3584, 1, 256] + - [63, 56.306] + - - [256, 2944, 1, 3328] + - [56, 66.99] + - - [448, 2368, 1, 128] + - [54, 56.415] + - - [1408, 704, 1, 256] + - [97, 60.962] + - - [448, 2944, 1, 3328] + - [64, 62.975] + - - [64, 5888, 1, 256] + - [66, 37.456] + - - [512, 1500, 1, 2048] + - [105, 66.819] + - - [6784, 128, 1, 3328] + - [75, 62.045] + - - [704, 704, 1, 256] + - [80, 46.999] + - - [448, 704, 1, 1280] + - [96, 39.202] + - - [1024, 448, 1, 3328] + - [56, 57.114] + - - [2944, 128, 1, 256] + - [55, 40.831] + - - [1024, 1024, 1, 1280] + - [75, 72.039] + - - [448, 1024, 1, 128] + - [54, 36.292] + - - [448, 2368, 1, 3328] + - [58, 64.761] + - - [5056, 64, 1, 128] + - [55, 30.138] + - - [1024, 700, 1, 512] + - [97, 56.744] + - - [128, 6784, 1, 1280] + - [64, 59.699] + - - [1856, 256, 1, 256] + - [54, 45.884] + - - [128, 5888, 1, 1280] + - [82, 65.077] + - - [256, 4288, 1, 1280] + - [64, 74.687] + - - [256, 1856, 1, 128] + - [97, 38.214] + - - [7680, 64, 1, 2560] + - [97, 60.953] + - - [448, 1408, 1, 128] + - [54, 40.655] + - - [6784, 128, 1, 256] + - [74, 55.048] + - - [704, 448, 1, 256] + - [55, 34.298] + - - [704, 448, 1, 128] + - [55, 29.859] + - - [704, 1408, 1, 128] + - [54, 53.202] + - - [4288, 128, 1, 1280] + - [106, 64.166] + - - [128, 2944, 1, 128] + - [55, 34.217] + - - [128, 4288, 1, 256] + - [63, 52.566] + - - [704, 448, 1, 3328] + - [73, 40.091] + - - [448, 2368, 1, 1280] + - [58, 63.909] + - - [64, 6784, 1, 3328] + - [65, 47.725] + - - [2944, 256, 1, 1280] + - [74, 65.646] + - - [256, 2368, 1, 128] + - [63, 40.33] + - - [1856, 704, 1, 256] + - [56, 65.785] + - - [1856, 448, 1, 1280] + - [56, 70.951] + - - [128, 5888, 1, 128] + - [97, 48.041] + - - [1024, 1024, 1, 256] + - [75, 64.973] + - - [704, 1856, 1, 256] + - [82, 64.824] + - - [256, 2368, 1, 1280] + - [56, 52.963] + - - [2944, 448, 1, 256] + - [74, 67.644] + - - [1856, 448, 1, 128] + - [54, 52.769] + - - [2368, 128, 1, 1280] + - [55, 38.002] + - - [64, 6784, 1, 256] + - [61, 38.756] + - - [64, 5056, 1, 1280] + - [62, 39.441] + - - [3025, 64, 64, 64] + - [74, 72.571] + - - [2368, 256, 1, 1280] + - [74, 52.052] + - - [2368, 448, 1, 1280] + - [74, 70.193] + - - [128, 3584, 1, 256] + - [63, 44.675] + - - [704, 448, 1, 1280] + - [55, 39.356] + - - [4288, 256, 1, 1280] + - [98, 74.953] + - - [4288, 128, 1, 3328] + - [56, 67.671] + - - [7680, 128, 1, 2560] + - [75, 69.386] + - - [1408, 256, 1, 128] + - [55, 33.576] + - - [256, 1408, 1, 1280] + - [81, 44.251] + - - [6784, 64, 1, 3328] + - [74, 54.362] + - - [128, 2944, 1, 3328] + - [104, 47.739] + - - [2944, 448, 1, 3328] + - [74, 73.487] + - - [5888, 128, 1, 256] + - [56, 56.672] + - - [5056, 64, 1, 256] + - [55, 35.124] + - - [512, 1500, 1, 1536] + - [82, 65.984] + - - [128, 3584, 1, 1280] + - [63, 54.669] + - - [1024, 704, 1, 128] + - [56, 47.093] + - - [128, 5056, 1, 3328] + - [82, 57.642] + - - [1024, 1024, 1, 128] + - [56, 56.911] + - - [4288, 128, 1, 256] + - [54, 52.304] + - - [1408, 448, 1, 128] + - [74, 42.063] + - - [3584, 256, 1, 256] + - [74, 56.852] + - - [128, 2944, 1, 256] + - [104, 39.712] + - - [128, 6784, 1, 128] + - [63, 48.127] + - - [448, 1856, 1, 256] + - [80, 59.063] + - - [3584, 128, 1, 3328] + - [57, 57.258] + - - [5888, 128, 1, 3328] + - [56, 66.927] + - - [1408, 704, 1, 1280] + - [98, 68.307] + - - [448, 2944, 1, 256] + - [103, 62.023] + - - [448, 2368, 1, 256] + - [103, 60.66] + - - [64, 6784, 1, 1280] + - [103, 46.778] + - - [128, 2368, 1, 3328] + - [104, 38.679] + - - [5056, 64, 1, 3328] + - [96, 41.03] + - - [64, 5888, 1, 128] + - [62, 34.005] + - - [5056, 128, 1, 3328] + - [56, 57.209] + - - [448, 704, 1, 256] + - [96, 33.982] + - - [2944, 128, 1, 3328] + - [75, 47.666] + - - [128, 5056, 1, 1280] + - [63, 56.27] + - - [704, 704, 1, 128] + - [61, 38.778] + - - [2368, 128, 1, 128] + - [55, 28.884] + - - [5056, 128, 1, 128] + - [55, 41.801] + - - [448, 1024, 1, 3328] + - [82, 56.469] + - - [2368, 256, 1, 256] + - [54, 46.187] + - - [256, 2368, 1, 3328] + - [56, 54.109] + - - [256, 3584, 1, 128] + - [97, 51.443] + - - [4288, 256, 1, 128] + - [54, 57.949] + - - [448, 1856, 1, 3328] + - [84, 64.928] + - - [2368, 256, 1, 128] + - [73, 39.423] + - - [256, 1856, 1, 256] + - [63, 46.196] + - - [256, 2944, 1, 128] + - [97, 48.903] + - - [1408, 256, 1, 3328] + - [73, 45.74] + - - [2368, 448, 1, 256] + - [74, 61.571] + - - [4288, 256, 1, 3328] + - [75, 77.615] + - - [1856, 704, 1, 128] + - [95, 59.604] + - - [4288, 128, 1, 128] + - [54, 42.491] + - - [6784, 64, 1, 1280] + - [74, 52.877] + - - [3584, 128, 1, 128] + - [54, 36.595] + - - [256, 2368, 1, 256] + - [63, 46.552] + - - [2944, 448, 1, 1280] + - [97, 72.72] + - - [448, 1408, 1, 1280] + - [56, 54.583] + - - [448, 1856, 1, 1280] + - [61, 64.802] + - - [1856, 256, 1, 128] + - [54, 37.483] + - - [128, 2368, 1, 256] + - [81, 32.597] + - - [5888, 64, 1, 1280] + - [55, 46.421] + - - [1024, 448, 1, 1280] + - [56, 55.377] + - - [128, 5056, 1, 256] + - [63, 49.304] + - - [1856, 704, 1, 1280] + - [97, 71.403] + - - [448, 2944, 1, 128] + - [54, 59.135] + - - [1408, 256, 1, 256] + - [55, 38.765] + - - [2368, 448, 1, 3328] + - [97, 71.863] + - - [128, 5888, 1, 3328] + - [63, 66.503] + - - [64, 5056, 1, 128] + - [104, 29.57] + - - [64, 6784, 1, 128] + - [62, 33.432] + - - [448, 704, 1, 128] + - [55, 29.759] + - - [1408, 448, 1, 256] + - [56, 48.7] + - - [1408, 704, 1, 128] + - [74, 55.147] + - - [2368, 256, 1, 3328] + - [97, 53.884] + - - [5888, 128, 1, 1280] + - [56, 65.452] + - - [256, 3584, 1, 1280] + - [64, 62.957] + - - [256, 1408, 1, 128] + - [55, 32.426] + - - [256, 4288, 1, 128] + - [97, 59.248] + - - [5888, 128, 1, 128] + - [56, 48.253] + - - [1408, 448, 1, 3328] + - [97, 56.523] + - - [704, 1024, 1, 1280] + - [56, 62.361] + - - [1856, 256, 1, 3328] + - [56, 58.738] + - - [64, 5888, 1, 1280] + - [109, 43.786] + - - [6784, 64, 1, 128] + - [74, 34.921] + - - [704, 704, 1, 1280] + - [56, 58.458] + - - [128, 2368, 1, 1280] + - [104, 37.772] + - - [3584, 256, 1, 1280] + - [75, 63.854] + - - [128, 4288, 1, 3328] + - [105, 67.807] + - - [3584, 128, 1, 1280] + - [56, 55.354] + - - [5056, 128, 1, 1280] + - [56, 56.424] + - - [256, 4288, 1, 256] + - [63, 66.589] + - - [1024, 448, 1, 256] + - [56, 45.365] + - - [2944, 128, 1, 1280] + - [55, 46.579] + - - [128, 2368, 1, 128] + - [55, 28.51] + - - [256, 2944, 1, 1280] + - [63, 65.352] + - - [2560, 128, 1, 2560] + - [104, 41.309] + - - [704, 1024, 1, 3328] + - [56, 64.012] + - - [128, 6784, 1, 256] + - [63, 53.748] + - - [256, 1856, 1, 3328] + - [56, 59.275] + - - [6784, 128, 1, 128] + - [97, 49.552] + - - [128, 3584, 1, 128] + - [74, 36.392] + - - [704, 1408, 1, 256] + - [64, 59.135] + - - [4096, 128, 1, 4096] + - [60, 64.919] + - - [5888, 64, 1, 128] + - [62, 33.175] + - - [5056, 128, 1, 256] + - [54, 48.474] + - - [6784, 128, 1, 1280] + - [75, 60.814] + - - [1856, 448, 1, 256] + - [74, 59.649] + - - [1024, 704, 1, 3328] + - [56, 64.206] + - - [128, 4288, 1, 1280] + - [63, 65.005] + - - [448, 704, 1, 3328] + - [96, 40.01] + - - [1856, 704, 1, 3328] + - [74, 72.621] + - - [512, 1500, 1, 2560] + - [63, 67.008] + - - [3136, 64, 128, 64] + - [52, 40.912] + - - [3136, 64, 128, 256] + - [66, 45.884] + - - [3136, 64, 256, 64] + - [52, 27.986] + - - [3136, 64, 256, 256] + - [86, 46.011] + - - [1024, 512, 1, 2048] + - [82, 62.122] + - - [4096, 256, 1, 2048] + - [64, 73.311] + - - [2048, 256, 1, 4096] + - [79, 64.946] + - - [512, 768, 1, 2048] + - [104, 48.903] + - - [2048, 256, 1, 1024] + - [74, 61.477] + - - [2048, 200, 1, 512] + - [97, 44.129] + - - [4096, 200, 1, 1024] + - [98, 54.835] + - - [2048, 200, 1, 4096] + - [74, 50.843] + - - [2048, 512, 1, 1024] + - [106, 71.263] + - - [1024, 1024, 1, 512] + - [75, 68.307] + - - [2048, 512, 1, 4096] + - [57, 73.361] + - - [1024, 1024, 1, 4096] + - [274, 74.284] + - - [4096, 200, 1, 2048] + - [75, 56.681] + - - [2048, 200, 1, 1024] + - [97, 47.933] + - - [1024, 768, 1, 512] + - [97, 64.012] + - - [2048, 200, 1, 2048] + - [97, 50.008] + - - [2048, 256, 1, 2048] + - [97, 63.742] + - - [512, 768, 1, 512] + - [104, 45.099] + - - [4096, 256, 1, 4096] + - [106, 73.794] + - - [1024, 512, 1, 512] + - [63, 57.615] + - - [1024, 1024, 1, 2048] + - [64, 72.49] + - - [4096, 256, 1, 1024] + - [64, 71.533] + - - [512, 768, 1, 1024] + - [81, 47.707] + - - [1024, 512, 1, 4096] + - [79, 65.095] + - - [4096, 200, 1, 4096] + - [75, 57.89] + - - [2048, 256, 1, 512] + - [56, 58.395] + - - [1024, 1024, 1, 1024] + - [64, 70.64] + - - [4096, 192, 1, 2048] + - [82, 69.007] + - - [5329, 64, 64, 160] + - [52, 38.521] + - - [1225, 64, 64, 384] + - [78, 72.855] + - - [4096, 320, 1, 1280] + - [74, 71.073] + - - [4096, 192, 1, 1280] + - [97, 68.028] + - - [1225, 96, 64, 384] + - [60, 58.179] + - - [4096, 320, 1, 2048] + - [74, 71.926] + - - [4096, 256, 1, 1536] + - [75, 72.752] + - - [64, 147, 432, 148] + - [76, 46.741] + - - [64, 123, 528, 123] + - [99, 51.032] + - - [64, 111, 576, 112] + - [76, 51.461] + - - [64, 77, 816, 77] + - [54, 37.655] + - - [64, 92, 688, 92] + - [76, 41.413] + - - [64, 159, 400, 159] + - [76, 51.37] + - - [64, 85, 752, 84] + - [54, 40.123] + - - [64, 122, 528, 123] + - [76, 51.226] + - - [64, 93, 688, 92] + - [61, 42.879] + - - [64, 102, 624, 99] + - [58, 44.206] + - - [64, 133, 480, 133] + - [76, 44.319] + - - [64, 232, 272, 232] + - [58, 56.866] + - - [64, 162, 400, 159] + - [58, 52.544] + - - [64, 78, 816, 78] + - [72, 39.04] + - - [64, 99, 624, 99] + - [99, 43.777] + - - [64, 101, 624, 102] + - [76, 44.662] + - - [64, 111, 576, 111] + - [58, 48.036] + - - [64, 134, 480, 134] + - [72, 52.065] + - - [64, 135, 480, 132] + - [99, 44.296] + - - [64, 134, 480, 132] + - [58, 44.206] + - - [64, 134, 480, 135] + - [99, 44.251] + - - [64, 162, 400, 162] + - [99, 52.3] + - - [64, 102, 624, 102] + - [76, 44.675] + - - [64, 135, 480, 133] + - [99, 44.107] + - - [64, 148, 432, 143] + - [99, 48.636] + - - [64, 100, 624, 100] + - [80, 45.045] + - - [64, 65, 992, 65] + - [95, 34.957] + - - [64, 122, 528, 122] + - [76, 50.653] + - - [64, 228, 272, 228] + - [99, 55.99] + - - [64, 112, 576, 111] + - [58, 48.88] + - - [64, 143, 432, 143] + - [76, 47.22] + - - [64, 135, 480, 135] + - [58, 44.301] + - - [64, 232, 272, 228] + - [99, 56.753] + - - [64, 193, 320, 193] + - [58, 48.203] + - - [64, 71, 896, 71] + - [54, 36.288] + - - [64, 84, 752, 84] + - [72, 39.92] + - - [64, 132, 480, 132] + - [54, 48.145] + - - [64, 85, 752, 85] + - [58, 37.461] + - - [64, 102, 624, 100] + - [58, 45.65] + - - [64, 78, 816, 77] + - [95, 37.438] + - - [64, 112, 576, 112] + - [76, 51.312] + - - [64, 148, 432, 148] + - [76, 48.772] + - - [64, 159, 400, 160] + - [58, 53.211] + - - [64, 102, 624, 101] + - [76, 44.792] + - - [64, 101, 624, 101] + - [58, 44.098] + - - [64, 160, 400, 160] + - [95, 61.729] + - - [64, 93, 688, 93] + - [58, 41.165] + - - [64, 147, 432, 147] + - [58, 47.274] + - - [64, 100, 624, 102] + - [76, 44.63] + - - [64, 177, 352, 177] + - [76, 55.359] + - - [500, 1024, 1, 512] + - [105, 53.807] + - - [512, 1024, 1, 512] + - [105, 58.228] + - - [200, 2048, 1, 512] + - [82, 43.322] + - - [512, 2000, 1, 1024] + - [64, 68.962] + - - [512, 2048, 1, 512] + - [83, 67.965] + - - [200, 2000, 1, 100] + - [94, 27.679] + - - [200, 2000, 1, 1024] + - [82, 46.177] + - - [500, 1024, 1, 2048] + - [82, 61.12] + - - [512, 2048, 1, 100] + - [97, 52.449] + - - [512, 2048, 1, 2000] + - [75, 74.164] + - - [200, 2000, 1, 10] + - [93, 6.939] + - - [500, 2048, 1, 1024] + - [83, 67.698] + - - [500, 2000, 1, 10] + - [100, 11.167] + - - [500, 2048, 1, 100] + - [97, 46.854] + - - [512, 1024, 1, 500] + - [97, 57.922] + - - [200, 2000, 1, 2000] + - [75, 49.146] + - - [500, 2048, 1, 2000] + - [98, 71.43] + - - [512, 2048, 1, 1024] + - [64, 70.888] + - - [512, 1024, 1, 100] + - [97, 36.392] + - - [256, 2000, 1, 10] + - [93, 9.023] + - - [512, 2000, 1, 100] + - [97, 49.042] + - - [512, 2000, 1, 2048] + - [64, 70.735] + - - [500, 1024, 1, 500] + - [56, 54.948] + - - [256, 2000, 1, 100] + - [96, 33.576] + - - [512, 1024, 1, 2048] + - [82, 63.859] + - - [500, 2048, 1, 2048] + - [106, 69.85] + - - [200, 2048, 1, 10] + - [93, 7.219] + - - [500, 2000, 1, 512] + - [105, 62.772] + - - [500, 1024, 1, 1024] + - [105, 58.58] + - - [200, 2000, 1, 500] + - [56, 43.01] + - - [256, 2048, 1, 100] + - [97, 36.17] + - - [500, 2000, 1, 1024] + - [83, 66.115] + - - [256, 2048, 1, 1024] + - [105, 61.508] + - - [200, 2048, 1, 1024] + - [82, 47.12] + - - [512, 2048, 1, 500] + - [75, 69.693] + - - [512, 2000, 1, 10] + - [93, 12.904] + - - [500, 1024, 1, 2000] + - [75, 62.808] + - - [512, 2000, 1, 512] + - [64, 65.452] + - - [500, 2000, 1, 2000] + - [75, 69.918] + - - [500, 1024, 1, 10] + - [93, 8.135] + - - [256, 2048, 1, 10] + - [93, 9.389] + - - [256, 2048, 1, 500] + - [97, 57.976] + - - [256, 2048, 1, 2048] + - [105, 63.841] + - - [256, 2000, 1, 512] + - [64, 52.99] + - - [512, 1024, 1, 2000] + - [98, 65.524] + - - [256, 2000, 1, 2000] + - [57, 63.065] + - - [256, 2048, 1, 2000] + - [75, 65.488] + - - [200, 2048, 1, 100] + - [55, 28.433] + - - [200, 2000, 1, 2048] + - [63, 48.05] + - - [500, 2048, 1, 512] + - [82, 64.491] + - - [500, 2000, 1, 500] + - [98, 63.976] + - - [200, 2048, 1, 2048] + - [82, 49.345] + - - [200, 2048, 1, 500] + - [97, 44.039] + - - [512, 2000, 1, 500] + - [98, 66.038] + - - [200, 2048, 1, 2000] + - [75, 50.274] + - - [500, 1024, 1, 100] + - [53, 33.188] + - - [512, 1024, 1, 10] + - [93, 9.096] + - - [512, 1024, 1, 1024] + - [105, 61.793] + - - [500, 2048, 1, 10] + - [111, 11.379] + - - [200, 2000, 1, 512] + - [105, 42.27] + - - [256, 2000, 1, 500] + - [97, 54.479] + - - [256, 2048, 1, 512] + - [63, 57.782] + - - [256, 2000, 1, 2048] + - [82, 61.486] + - - [500, 2048, 1, 500] + - [75, 65.736] + - - [256, 2000, 1, 1024] + - [105, 58.436] + - - [500, 2000, 1, 2048] + - [83, 68.24] + - - [512, 2000, 1, 2000] + - [64, 71.597] + - - [512, 2048, 1, 2048] + - [64, 72.391] + - - [512, 2048, 1, 10] + - [93, 13.364] + - - [500, 2000, 1, 100] + - [95, 44.761] + - - [1024, 1131, 1, 1024] + - [106, 75.896] + - - [1024, 1102, 1, 1024] + - [83, 75.188] + - - [1024, 774, 1, 1024] + - [63, 66.575] + - - [4096, 128, 1, 2048] + - [56, 63.593] + - - [4096, 128, 1, 3072] + - [56, 64.328] + - - [1024, 1120, 1, 1024] + - [83, 76.149] + - - [1024, 1015, 1, 1024] + - [83, 69.07] + - - [1024, 992, 1, 1024] + - [106, 67.662] + - - [1024, 950, 1, 1024] + - [106, 65.402] + - - [1024, 1088, 1, 1024] + - [83, 74.466] + - - [64, 128, 96, 128] + - [107, 42.925] + - - [768, 1024, 1, 3072] + - [274, 70.441] + - - [768, 512, 1, 3072] + - [272, 55.084] + - - [64, 256, 192, 256] + - [65, 59.541] + - - [64, 128, 384, 128] + - [107, 52.656] + - - [64, 256, 96, 256] + - [65, 52.719] + - - [6272, 112, 1, 512] + - [97, 56.383] + - - [2048, 320, 1, 1280] + - [97, 57.155] + - - [5329, 64, 1, 448] + - [102, 38.214] + - - [784, 64, 32, 192] + - [54, 63.57] + - - [6272, 64, 1, 480] + - [55, 46.787] + - - [6272, 64, 1, 512] + - [55, 46.972] + - - [6272, 160, 1, 528] + - [74, 54.479] + - - [289, 160, 32, 768] + - [82, 51.641] + - - [5329, 64, 32, 160] + - [82, 55.562] + - - [5329, 96, 1, 576] + - [97, 40.533] + - - [1225, 64, 32, 288] + - [56, 74.976] + - - [289, 192, 32, 768] + - [82, 61.77] + - - [2048, 448, 1, 1280] + - [75, 63.092] + - - [3136, 64, 32, 64] + - [54, 71.109] + - - [6272, 128, 1, 528] + - [74, 66.498] + - - [6272, 96, 1, 480] + - [97, 48.582] + - - [2048, 448, 1, 2048] + - [75, 63.399] + - - [784, 96, 32, 192] + - [74, 51.213] + - - [1001, 512, 1, 4096] + - [101, 63.101] + - - [2048, 192, 1, 1280] + - [55, 48.42] + - - [1225, 64, 32, 256] + - [78, 69.323] + - - [2048, 256, 1, 1536] + - [56, 63.345] + - - [6272, 128, 1, 512] + - [74, 65.858] + - - [1568, 384, 1, 832] + - [74, 50.928] + - - [1568, 256, 1, 832] + - [55, 48.262] + - - [1568, 192, 1, 832] + - [71, 37.172] + - - [289, 192, 32, 1024] + - [63, 61.707] + - - [1225, 64, 32, 384] + - [56, 75.905] + - - [2048, 320, 1, 2048] + - [63, 57.615] + - - [2048, 384, 1, 1536] + - [74, 68.244] + - - [5041, 96, 1, 576] + - [97, 40.231] + - - [6272, 192, 1, 480] + - [97, 65.199] + - - [5041, 192, 1, 720] + - [97, 63.033] + - - [289, 128, 32, 768] + - [106, 54.763] + - - [12544, 64, 1, 147] + - [97, 52.201] + - - [6272, 160, 1, 512] + - [97, 54.366] + - - [1225, 64, 32, 192] + - [105, 71.867] + - - [784, 64, 32, 256] + - [105, 59.176] + - - [6272, 144, 1, 512] + - [56, 48.497] + - - [8192, 192, 1, 1280] + - [97, 73.514] + - - [8192, 192, 1, 2048] + - [56, 73.803] + - - [65, 6400, 1, 1024] + - [63, 34.794] + - - [512, 1290, 1, 2048] + - [105, 57.714] + - - [512, 2205, 1, 2048] + - [64, 77.615] + - - [64, 512, 16, 512] + - [107, 50.17] + - - [512, 600, 1, 2048] + - [104, 38.305] + - - [512, 644, 1, 512] + - [81, 37.632] + - - [512, 644, 1, 2048] + - [81, 41.025] + - - [512, 668, 1, 2048] + - [81, 42.636] + - - [512, 714, 1, 512] + - [104, 41.395] + - - [512, 714, 1, 2048] + - [104, 45.474] + - - [512, 720, 1, 512] + - [81, 41.783] + - - [512, 720, 1, 2048] + - [104, 45.821] + - - [512, 722, 1, 2048] + - [104, 45.961] + - - [512, 781, 1, 512] + - [104, 45.104] + - - [512, 781, 1, 2048] + - [81, 49.769] + - - [512, 848, 1, 2048] + - [82, 52.58] + - - [512, 872, 1, 2048] + - [82, 54.136] + - - [512, 936, 1, 512] + - [105, 51.253] + - - [512, 936, 1, 2048] + - [82, 57.912] + - - [512, 980, 1, 512] + - [64, 52.07] + - - [512, 980, 1, 2048] + - [105, 60.556] + - - [512, 1139, 1, 2048] + - [82, 69.783] + - - [512, 1184, 1, 2048] + - [82, 52.814] + - - [512, 1186, 1, 2048] + - [82, 53.144] + - - [512, 1232, 1, 512] + - [105, 50.734] + - - [512, 1232, 1, 2048] + - [82, 55.476] + - - [512, 1279, 1, 2048] + - [82, 57.376] + - - [512, 1290, 1, 512] + - [105, 52.756] + - - [512, 1327, 1, 2048] + - [82, 59.338] + - - [512, 1331, 1, 2048] + - [105, 59.18] + - - [512, 1341, 1, 2048] + - [82, 59.974] + - - [512, 1350, 1, 512] + - [105, 55.481] + - - [512, 1350, 1, 2048] + - [82, 60.642] + - - [512, 1359, 1, 2048] + - [105, 60.629] + - - [512, 1391, 1, 2048] + - [82, 62.185] + - - [512, 1424, 1, 512] + - [105, 58.116] + - - [512, 1424, 1, 2048] + - [63, 63.624] + - - [512, 1458, 1, 512] + - [105, 59.505] + - - [512, 1458, 1, 2048] + - [105, 64.987] + - - [512, 1462, 1, 512] + - [105, 59.379] + - - [512, 1462, 1, 2048] + - [105, 65.055] + - - [512, 1467, 1, 2048] + - [82, 65.131] + - - [512, 1472, 1, 2048] + - [105, 65.835] + - - [512, 1520, 1, 512] + - [105, 61.734] + - - [512, 1520, 1, 2048] + - [82, 67.595] + - - [512, 1596, 1, 512] + - [63, 63.814] + - - [512, 1596, 1, 2048] + - [82, 70.685] + - - [512, 1599, 1, 512] + - [63, 63.421] + - - [512, 1599, 1, 2048] + - [63, 70.775] + - - [512, 1615, 1, 512] + - [63, 64.748] + - - [512, 1615, 1, 2048] + - [105, 71.727] + - - [512, 1680, 1, 512] + - [83, 55.688] + - - [512, 1680, 1, 2048] + - [106, 59.839] + - - [512, 1709, 1, 2048] + - [64, 60.899] + - - [512, 1890, 1, 512] + - [83, 61.919] + - - [512, 1902, 1, 2048] + - [83, 67.342] + - - [512, 1917, 1, 512] + - [83, 62.253] + - - [512, 1917, 1, 2048] + - [106, 67.518] + - - [512, 2076, 1, 2048] + - [106, 73.27] + - - [512, 2195, 1, 2048] + - [83, 77.218] + - - [512, 2205, 1, 512] + - [106, 71.763] + - - [2048, 198, 1, 512] + - [74, 43.692] + - - [2048, 207, 1, 512] + - [74, 45.677] + - - [2048, 208, 1, 512] + - [74, 45.767] + - - [2048, 245, 1, 512] + - [82, 51.835] + - - [2048, 246, 1, 512] + - [74, 52.282] + - - [2048, 264, 1, 512] + - [56, 43.547] + - - [2048, 401, 1, 512] + - [75, 55.052] + - - [2048, 439, 1, 512] + - [98, 58.567] + - - [2048, 443, 1, 512] + - [98, 58.905] + - - [2048, 446, 1, 512] + - [75, 59.104] + - - [2048, 465, 1, 512] + - [97, 61.585] + - - [2048, 468, 1, 512] + - [97, 61.639] + - - [2048, 493, 1, 512] + - [74, 63.724] + - - [2048, 495, 1, 512] + - [97, 64.725] + - - [2048, 511, 1, 512] + - [97, 66.413] + - - [2048, 512, 1, 512] + - [75, 69.327] + - - [2048, 540, 1, 512] + - [63, 69.156] + - - [2048, 550, 1, 512] + - [63, 70.135] + - - [2048, 560, 1, 512] + - [63, 71.218] + - - [2048, 600, 1, 512] + - [97, 64.364] + - - [64, 64, 496, 64] + - [77, 40.29] + - - [64, 65, 496, 64] + - [54, 32.521] + - - [64, 65, 496, 65] + - [95, 31.483] + - - [64, 70, 216, 70] + - [54, 30.377] + - - [64, 71, 216, 71] + - [54, 30.702] + - - [64, 78, 248, 77] + - [95, 33.973] + - - [64, 80, 152, 80] + - [72, 29.507] + - - [64, 93, 344, 93] + - [55, 36.464] + - - [64, 102, 312, 102] + - [55, 39.451] + - - [64, 122, 264, 122] + - [77, 43.84] + - - [64, 122, 264, 123] + - [59, 43.998] + - - [64, 123, 264, 123] + - [77, 44.562] + - - [64, 512, 96, 512] + - [107, 62.785] + - - [64, 512, 128, 512] + - [69, 48.984] + - - [64, 128, 512, 128] + - [107, 51.037] + - - [64, 512, 64, 512] + - [65, 56.293] + - - [2048, 512, 1, 2048] + - [64, 72.53] + - - [512, 1600, 1, 32] + - [93, 27.124] + - - [512, 1600, 1, 512] + - [63, 66.963] + - - [560, 1600, 1, 1024] + - [64, 60.132] + - - [1024, 512, 1, 3072] + - [79, 64.373] + - - [64, 192, 64, 1280] + - [65, 60.85] + - - [64, 320, 64, 1280] + - [65, 57.213] + - - [64, 384, 64, 1280] + - [107, 49.458] + - - [64, 448, 64, 1280] + - [90, 46.457] + - - [64, 192, 64, 2048] + - [107, 57.863] + - - [64, 320, 64, 2048] + - [108, 39.5] + - - [64, 384, 64, 2048] + - [70, 39.013] + - - [64, 448, 64, 2048] + - [91, 38.165] + - - [1225, 64, 64, 192] + - [74, 76.902] + - - [1225, 64, 64, 256] + - [63, 77.065] + - - [1225, 64, 64, 288] + - [97, 79.122] + - - [5329, 80, 64, 64] + - [52, 34.749] + - - [3136, 64, 64, 64] + - [74, 74.989] + - - [3136, 64, 64, 256] + - [66, 45.821] + - - [64, 192, 32, 1280] + - [81, 45.289] + - - [64, 320, 32, 1280] + - [65, 51.438] + - - [64, 384, 32, 1280] + - [65, 60.782] + - - [64, 448, 32, 1280] + - [107, 56.203] + - - [64, 192, 32, 2048] + - [62, 46.308] + - - [64, 320, 32, 2048] + - [84, 52.264] + - - [64, 384, 32, 2048] + - [65, 60.525] + - - [64, 448, 32, 2048] + - [65, 55.702] + - - [5329, 80, 32, 64] + - [97, 48.645] + - - [3136, 64, 32, 256] + - [74, 79.934] + - - [196, 256, 32, 1024] + - [105, 56.207] + - - [256, 4096, 1, 4] + - [93, 6.186] + - - [960, 1024, 1, 1024] + - [83, 65.479] + - - [768, 768, 1, 768] + - [97, 67.807] + - - [768, 768, 1, 384] + - [97, 62.69] + - - [100, 128, 120, 512] + - [105, 58.147] + - - [100, 128, 139, 512] + - [105, 59.257] + - - [100, 128, 160, 512] + - [67, 59.388] + - - [22500, 64, 1, 147] + - [82, 62.014] + - - [1024, 960, 1, 1024] + - [83, 65.686] + - - [1024, 616, 1, 1024] + - [63, 53.162] + - - [64, 128, 128, 128] + - [107, 47.089] + - - [64, 128, 160, 128] + - [68, 44.265] + - - [1024, 1024, 1, 2] + - [61, 3.072] + - - [64, 128, 624, 128] + - [65, 58.828] + - - [1024, 780, 1, 1024] + - [63, 66.972] + - - [64, 128, 640, 128] + - [107, 57.827] + - - [1024, 800, 1, 1024] + - [105, 68.438] + - - [64, 128, 656, 128] + - [65, 58.337] + - - [1024, 820, 1, 1024] + - [82, 70.049] + - - [64, 512, 80, 512] + - [84, 61.057] + - - [1024, 385, 1, 1024] + - [97, 45.762] + - - [1024, 462, 1, 1024] + - [97, 54.835] + - - [64, 128, 144, 128] + - [107, 50.315] + - - [1024, 960, 1, 64] + - [54, 44.075] + - - [64, 512, 256, 512] + - [70, 44.31] + - - [64, 512, 40, 512] + - [84, 52.237] + - - [96, 1024, 64, 1024] + - [110, 47.662] + - - [96, 1024, 128, 1024] + - [89, 48.767] + - - [64, 1024, 256, 1024] + - [112, 44.043] + - - [64, 1024, 32, 1024] + - [107, 52.092] + - - [64, 1024, 64, 1024] + - [112, 44.138] + - - [64, 1024, 128, 1024] + - [112, 43.886] + - - [64, 128, 1024, 128] + - [107, 58.607] + - - [1024, 864, 1, 1024] + - [83, 58.328] + - - [1024, 864, 1, 512] + - [63, 56.302] + - - [256, 3456, 1, 128] + - [105, 49.507] + - - [256, 4096, 1, 128] + - [97, 56.595] + - - [480, 864, 1, 1024] + - [82, 47.996] + - - [512, 864, 1, 256] + - [103, 38.706] + - - [64, 128, 1280, 128] + - [108, 32.647] + - - [64, 128, 1312, 128] + - [85, 31.74] + - - [64, 512, 192, 512] + - [112, 43.664] + - - [256, 4096, 1, 1] + - [93, 1.462] + - - [64, 128, 2048, 128] + - [219, 31.122] + - - [64, 128, 1536, 128] + - [218, 31.501] + - - [64, 128, 192, 128] + - [88, 43.01] + - - [64, 384, 144, 384] + - [107, 63.561] + - - [64, 512, 48, 512] + - [65, 55.95] + - - [64, 128, 256, 128] + - [109, 48.56] + - - [64, 384, 192, 384] + - [68, 54.326] + - - [950, 512, 2, 2048] + - [64, 66.092] + - - [3400, 256, 1, 1024] + - [75, 58.738] + - - [3800, 256, 1, 1024] + - [57, 65.267] + - - [850, 512, 2, 2048] + - [64, 59.902] + - - [805, 512, 2, 2048] + - [65, 63.345] + - - [864, 512, 2, 2048] + - [64, 60.962] + - - [950, 256, 2, 2048] + - [63, 57.709] + - - [888, 512, 2, 2048] + - [106, 62.393] + - - [51520, 64, 2, 256] + - [56, 80.94] + - - [46464, 64, 2, 256] + - [97, 81.739] + - - [49152, 64, 2, 256] + - [97, 82.276] + - - [1900, 512, 1, 1024] + - [83, 64.761] + - - [1700, 512, 1, 1024] + - [83, 58.499] + - - [1610, 512, 1, 1024] + - [105, 67.96] + - - [1536, 512, 1, 1024] + - [82, 66.814] + - - [1728, 512, 1, 1024] + - [106, 59.514] + - - [1024, 1024, 1, 320] + - [75, 66.868] + - - [51520, 64, 2, 64] + - [103, 71.745] + - - [55296, 64, 2, 64] + - [97, 74.371] + - - [49152, 64, 2, 64] + - [56, 72.436] + - - [54400, 64, 2, 64] + - [63, 74.525] + - - [42240, 64, 2, 256] + - [97, 81.739] + - - [672, 512, 2, 2048] + - [82, 59.893] + - - [54400, 64, 2, 256] + - [87, 78.847] + - - [56832, 64, 2, 256] + - [67, 76.704] + - - [55296, 64, 2, 256] + - [110, 77.71] + - - [60800, 64, 2, 64] + - [56, 70.798] + - - [660, 512, 2, 2048] + - [82, 58.616] + - - [768, 512, 2, 2048] + - [82, 68.565] + - - [43008, 64, 2, 256] + - [97, 82.921] + - - [864, 256, 2, 2048] + - [63, 52.877] + - - [726, 512, 2, 2048] + - [82, 64.405] + - - [768, 256, 2, 2048] + - [104, 48.821] + - - [45632, 64, 2, 256] + - [63, 82.032] + - - [713, 512, 2, 2048] + - [82, 63.119] + - - [805, 256, 2, 2048] + - [81, 50.942] + - - [60800, 64, 2, 256] + - [67, 67.423] + - - [850, 256, 2, 2048] + - [82, 50.797] + - - [1024, 1024, 1, 81] + - [56, 48.262] + - - [96, 1024, 160, 1024] + - [87, 49.814] + - - [96, 1024, 40, 1024] + - [67, 51.023] + - - [96, 1024, 80, 1024] + - [87, 47.788] + - - [96, 1024, 96, 1024] + - [67, 48.632] + - - [96, 1024, 24, 1024] + - [63, 60.186] + - - [96, 1024, 48, 1024] + - [67, 49.097] + - - [96, 1024, 16, 1024] + - [82, 55.142] + - - [96, 1024, 32, 1024] + - [110, 57.236] + - - [64, 512, 320, 512] + - [70, 44.634] + - - [64, 1024, 512, 1024] + - [70, 44.332] + - - [1024, 80, 1, 30522] + - [113, 38.859] + - - [1024, 120, 1, 30522] + - [113, 57.732] + - - [1024, 77, 1, 30522] + - [113, 37.465] + - - [1024, 200, 1, 30522] + - [119, 55.882] + - - [1024, 160, 1, 30522] + - [113, 55.512] + - - [1024, 180, 1, 30522] + - [113, 62.713] + - - [1024, 160, 1, 30528] + - [115, 55.751] + - - [1024, 240, 1, 30528] + - [114, 66.327] + - - [2560, 109, 1, 29000] + - [118, 59.149] + - - [2560, 121, 1, 29000] + - [116, 65.591] + - - [2560, 65, 1, 29000] + - [115, 35.462] + - - [2560, 66, 1, 29000] + - [118, 35.972] + - - [2560, 67, 1, 29000] + - [118, 36.581] + - - [2560, 69, 1, 29000] + - [116, 37.655] + - - [2560, 70, 1, 29000] + - [116, 38.174] + - - [2560, 71, 1, 29000] + - [116, 38.652] + - - [2560, 73, 1, 29000] + - [116, 39.762] + - - [2560, 74, 1, 29000] + - [118, 40.199] + - - [2560, 75, 1, 29000] + - [118, 40.885] + - - [2560, 77, 1, 29000] + - [118, 41.959] + - - [2560, 78, 1, 29000] + - [118, 42.532] + - - [2560, 80, 1, 29000] + - [118, 43.592] + - - [2560, 81, 1, 29000] + - [118, 44.066] + - - [2560, 82, 1, 29000] + - [117, 44.472] + - - [2560, 83, 1, 29000] + - [118, 45.262] + - - [2560, 84, 1, 29000] + - [118, 45.735] + - - [2560, 88, 1, 29000] + - [118, 47.847] + - - [2560, 89, 1, 29000] + - [116, 48.442] + - - [2560, 90, 1, 29000] + - [118, 48.961] + - - [2560, 92, 1, 29000] + - [116, 50.058] + - - [2560, 95, 1, 29000] + - [118, 51.551] + - - [2560, 98, 1, 29000] + - [118, 53.265] + - - [512, 200, 1, 32] + - [124, 5.477] + - - [1024, 200, 1, 1] + - [143, 0.541] + - - [512, 200, 1, 1] + - [145, 0.298] + - - [768, 320, 1, 768] + - [169, 29.484] + - - [768, 160, 1, 768] + - [123, 23.948] + - - [1024, 120, 1, 1024] + - [168, 22.983] + - - [1024, 160, 1, 1024] + - [144, 25.451] + - - [2368, 64, 1, 3328] + - [135, 24.467] + - - [64, 3584, 1, 1280] + - [137, 27.968] + - - [1408, 64, 1, 128] + - [122, 13.346] + - - [1408, 64, 1, 1280] + - [122, 18.827] + - - [4096, 32, 1, 4096] + - [128, 25.852] + - - [3072, 64, 1, 1024] + - [123, 30.441] + - - [2944, 64, 1, 256] + - [123, 25.13] + - - [448, 448, 1, 3328] + - [144, 32.493] + - - [1024, 256, 1, 3328] + - [146, 32.173] + - - [6144, 32, 1, 2560] + - [168, 30.788] + - - [1856, 64, 1, 1280] + - [123, 22.834] + - - [704, 128, 1, 1280] + - [135, 19.103] + - - [4288, 64, 1, 3328] + - [146, 33.716] + - - [64, 3584, 1, 3328] + - [154, 28.658] + - - [1760, 128, 1, 1760] + - [125, 27.503] + - - [704, 256, 1, 128] + - [123, 20.172] + - - [128, 1408, 1, 128] + - [168, 19.937] + - - [1024, 256, 1, 256] + - [146, 27.779] + - - [448, 448, 1, 256] + - [123, 26.705] + - - [7680, 32, 1, 2560] + - [135, 28.807] + - - [128, 1024, 1, 3328] + - [162, 28.564] + - - [64, 1856, 1, 1280] + - [153, 25.73] + - - [256, 1024, 1, 256] + - [146, 28.189] + - - [1024, 128, 1, 1280] + - [153, 25.523] + - - [3072, 32, 1, 1024] + - [327, 18.955] + - - [448, 256, 1, 3328] + - [123, 25.171] + - - [128, 1024, 1, 128] + - [135, 16.973] + - - [128, 704, 1, 1280] + - [168, 19.712] + - - [1856, 128, 1, 3328] + - [146, 29.254] + - - [35, 8457, 1, 1760] + - [146, 18.674] + - - [64, 2944, 1, 128] + - [168, 20.767] + - - [8448, 32, 1, 2816] + - [153, 31.447] + - - [1408, 128, 1, 1280] + - [123, 28.234] + - - [128, 1856, 1, 1280] + - [146, 29.069] + - - [256, 448, 1, 256] + - [153, 19.085] + - - [2048, 128, 1, 2048] + - [146, 31.88] + - - [128, 1856, 1, 128] + - [146, 22.491] + - - [64, 1408, 1, 3328] + - [135, 20.181] + - - [128, 1408, 1, 256] + - [135, 23.926] + - - [35, 8457, 1, 2560] + - [154, 18.66] + - - [4288, 64, 1, 128] + - [125, 25.315] + - - [256, 448, 1, 3328] + - [153, 25.672] + - - [64, 2368, 1, 1280] + - [168, 23.98] + - - [2368, 64, 1, 256] + - [123, 20.307] + - - [1024, 128, 1, 128] + - [123, 15.768] + - - [704, 128, 1, 3328] + - [123, 19.978] + - - [4288, 64, 1, 1280] + - [146, 33.057] + - - [2560, 64, 1, 2560] + - [123, 26.398] + - - [1408, 128, 1, 128] + - [123, 20.172] + - - [128, 1024, 1, 1280] + - [153, 28.352] + - - [2944, 64, 1, 128] + - [123, 21.169] + - - [1024, 128, 1, 3328] + - [153, 26.646] + - - [704, 128, 1, 256] + - [122, 15.868] + - - [448, 256, 1, 1280] + - [135, 24.62] + - - [64, 4288, 1, 3328] + - [169, 34.0] + - - [2944, 64, 1, 3328] + - [162, 30.188] + - - [1856, 128, 1, 1280] + - [125, 28.821] + - - [64, 3584, 1, 256] + - [154, 24.715] + - - [3584, 64, 1, 128] + - [123, 21.16] + - - [256, 1024, 1, 1280] + - [146, 32.263] + - - [64, 4288, 1, 128] + - [154, 25.478] + - - [3584, 64, 1, 1280] + - [163, 27.585] + - - [1408, 128, 1, 3328] + - [168, 28.843] + - - [64, 2944, 1, 3328] + - [153, 30.432] + - - [64, 1856, 1, 256] + - [153, 20.0] + - - [128, 1500, 1, 1280] + - [135, 29.822] + - - [35, 8457, 1, 4096] + - [154, 17.988] + - - [256, 704, 1, 256] + - [162, 21.864] + - - [2368, 64, 1, 128] + - [162, 17.027] + - - [256, 1024, 1, 128] + - [146, 24.499] + - - [64, 1408, 1, 128] + - [134, 12.944] + - - [704, 256, 1, 3328] + - [144, 29.231] + - - [35, 8457, 1, 2048] + - [154, 18.647] + - - [64, 2944, 1, 256] + - [168, 25.013] + - - [448, 256, 1, 128] + - [135, 15.051] + - - [64, 1408, 1, 1280] + - [168, 19.667] + - - [1408, 128, 1, 256] + - [123, 24.147] + - - [64, 2944, 1, 1280] + - [168, 29.583] + - - [128, 704, 1, 128] + - [122, 13.206] + - - [256, 448, 1, 1280] + - [135, 24.657] + - - [704, 256, 1, 1280] + - [144, 28.451] + - - [64, 2368, 1, 3328] + - [135, 24.634] + - - [1856, 64, 1, 128] + - [135, 15.177] + - - [4096, 64, 1, 4096] + - [137, 32.006] + - - [704, 128, 1, 128] + - [122, 13.206] + - - [256, 704, 1, 3328] + - [168, 29.173] + - - [256, 448, 1, 128] + - [162, 14.397] + - - [64, 3584, 1, 128] + - [154, 21.503] + - - [1024, 128, 1, 256] + - [123, 19.712] + - - [2944, 64, 1, 1280] + - [123, 29.439] + - - [128, 1408, 1, 3328] + - [123, 29.123] + - - [1408, 64, 1, 256] + - [122, 16.012] + - - [64, 1856, 1, 128] + - [135, 15.66] + - - [64, 2368, 1, 256] + - [168, 20.213] + - - [1856, 128, 1, 128] + - [125, 22.64] + - - [2368, 64, 1, 1280] + - [123, 24.057] + - - [4288, 64, 1, 256] + - [146, 29.403] + - - [64, 4288, 1, 1280] + - [154, 33.522] + - - [1408, 64, 1, 3328] + - [123, 19.382] + - - [1024, 256, 1, 128] + - [146, 24.26] + - - [256, 704, 1, 128] + - [123, 19.937] + - - [448, 448, 1, 1280] + - [162, 31.6] + - - [1024, 256, 1, 1280] + - [125, 32.06] + - - [128, 1024, 1, 256] + - [135, 22.067] + - - [3584, 64, 1, 3328] + - [169, 28.23] + - - [256, 1024, 1, 3328] + - [137, 32.683] + - - [1856, 64, 1, 3328] + - [123, 23.637] + - - [448, 256, 1, 256] + - [153, 19.594] + - - [4608, 32, 1, 1536] + - [149, 27.86] + - - [128, 704, 1, 256] + - [122, 15.768] + - - [3584, 64, 1, 256] + - [123, 24.305] + - - [64, 1856, 1, 3328] + - [135, 26.502] + - - [128, 704, 1, 3328] + - [135, 20.1] + - - [128, 1856, 1, 256] + - [154, 25.595] + - - [64, 4288, 1, 256] + - [154, 29.403] + - - [1856, 64, 1, 256] + - [168, 19.107] + - - [2560, 32, 1, 2560] + - [152, 17.524] + - - [256, 704, 1, 1280] + - [123, 28.361] + - - [64, 2368, 1, 128] + - [135, 16.833] + - - [176, 1500, 1, 1408] + - [146, 32.426] + - - [1856, 128, 1, 256] + - [125, 25.983] + - - [2048, 64, 1, 2048] + - [165, 25.284] + - - [64, 1408, 1, 256] + - [152, 15.674] + - - [128, 1408, 1, 1280] + - [153, 28.311] + - - [128, 1856, 1, 3328] + - [169, 29.719] + - - [1760, 64, 1, 1760] + - [160, 22.229] + - - [448, 448, 1, 128] + - [123, 22.374] + - - [704, 256, 1, 256] + - [162, 24.038] + - - [1024, 256, 1, 1024] + - [259, 36.722] + - - [512, 200, 1, 512] + - [135, 19.486] + - - [1024, 200, 1, 1024] + - [125, 24.706] + - - [512, 256, 1, 1024] + - [168, 27.526] + - - [1024, 256, 1, 2048] + - [125, 32.331] + - - [1024, 200, 1, 4096] + - [146, 25.207] + - - [1024, 200, 1, 512] + - [146, 23.632] + - - [512, 200, 1, 1024] + - [168, 21.543] + - - [512, 256, 1, 512] + - [168, 24.9] + - - [1024, 256, 1, 4096] + - [288, 51.296] + - - [1024, 200, 1, 2048] + - [125, 25.324] + - - [1024, 256, 1, 512] + - [125, 30.156] + - - [512, 200, 1, 2048] + - [135, 21.67] + - - [64, 32, 1984, 32] + - [141, 29.989] + - - [64, 38, 1680, 38] + - [135, 22.252] + - - [64, 59, 1088, 59] + - [135, 33.648] + - - [64, 54, 1184, 54] + - [135, 31.212] + - - [64, 49, 1296, 49] + - [168, 27.612] + - - [64, 45, 1424, 45] + - [135, 26.051] + - - [64, 35, 1808, 35] + - [162, 20.149] + - - [64, 41, 1552, 41] + - [135, 23.195] + - - [512, 512, 1, 1024] + - [169, 31.853] + - - [512, 512, 1, 2000] + - [125, 32.403] + - - [100, 1024, 1, 2048] + - [153, 22.004] + - - [100, 2000, 1, 1024] + - [137, 24.241] + - - [128, 2000, 1, 100] + - [146, 21.71] + - - [64, 2000, 1, 1024] + - [135, 26.709] + - - [100, 1024, 1, 1024] + - [168, 21.602] + - - [128, 1024, 1, 512] + - [135, 25.703] + - - [512, 500, 1, 2000] + - [146, 31.564] + - - [500, 512, 1, 100] + - [125, 21.39] + - - [100, 1024, 1, 500] + - [135, 19.152] + - - [128, 2000, 1, 512] + - [154, 29.159] + - - [256, 1024, 1, 100] + - [146, 21.904] + - - [200, 500, 1, 1024] + - [168, 21.174] + - - [100, 2000, 1, 512] + - [137, 23.032] + - - [200, 512, 1, 100] + - [166, 11.379] + - - [64, 2048, 1, 10] + - [124, 3.212] + - - [64, 2048, 1, 500] + - [168, 24.846] + - - [512, 512, 1, 512] + - [146, 30.707] + - - [500, 500, 1, 2000] + - [125, 30.878] + - - [256, 500, 1, 10] + - [161, 3.009] + - - [512, 500, 1, 512] + - [146, 29.57] + - - [128, 1024, 1, 2000] + - [144, 28.374] + - - [100, 2000, 1, 2048] + - [137, 24.693] + - - [256, 512, 1, 10] + - [145, 3.082] + - - [64, 2000, 1, 2048] + - [168, 26.66] + - - [64, 2048, 1, 512] + - [135, 25.487] + - - [64, 2000, 1, 10] + - [124, 3.14] + - - [128, 1024, 1, 500] + - [135, 24.598] + - - [200, 512, 1, 1024] + - [168, 21.62] + - - [128, 2048, 1, 10] + - [161, 5.477] + - - [64, 2048, 1, 100] + - [168, 14.149] + - - [64, 2000, 1, 100] + - [168, 13.246] + - - [200, 500, 1, 100] + - [159, 10.896] + - - [500, 500, 1, 500] + - [146, 28.834] + - - [128, 2048, 1, 512] + - [154, 30.522] + - - [100, 2048, 1, 500] + - [146, 23.474] + - - [500, 500, 1, 2048] + - [146, 30.675] + - - [128, 2000, 1, 2000] + - [125, 31.55] + - - [256, 500, 1, 1024] + - [168, 26.854] + - - [64, 2048, 1, 2000] + - [135, 28.889] + - - [100, 2048, 1, 1024] + - [137, 24.769] + - - [128, 1024, 1, 100] + - [162, 13.689] + - - [256, 1024, 1, 2048] + - [137, 32.349] + - - [500, 512, 1, 512] + - [154, 29.511] + - - [256, 500, 1, 2000] + - [123, 27.697] + - - [256, 512, 1, 100] + - [162, 13.562] + - - [128, 2000, 1, 500] + - [146, 29.403] + - - [200, 512, 1, 2048] + - [144, 20.925] + - - [64, 2048, 1, 2048] + - [168, 27.499] + - - [200, 1024, 1, 2048] + - [137, 25.306] + - - [512, 512, 1, 10] + - [161, 5.527] + - - [512, 500, 1, 10] + - [161, 5.252] + - - [200, 512, 1, 10] + - [124, 2.567] + - - [500, 500, 1, 1024] + - [137, 30.206] + - - [256, 1024, 1, 512] + - [169, 30.49] + - - [256, 500, 1, 512] + - [168, 24.156] + - - [200, 500, 1, 2048] + - [151, 19.811] + - - [100, 2000, 1, 10] + - [161, 4.065] + - - [100, 2048, 1, 2048] + - [137, 25.284] + - - [128, 1024, 1, 2048] + - [153, 27.968] + - - [100, 2000, 1, 500] + - [125, 22.902] + - - [100, 2048, 1, 100] + - [146, 17.113] + - - [100, 1024, 1, 10] + - [124, 2.459] + - - [100, 1024, 1, 2000] + - [153, 22.125] + - - [256, 512, 1, 500] + - [144, 24.354] + - - [100, 2000, 1, 100] + - [146, 16.77] + - - [128, 1024, 1, 10] + - [161, 3.145] + - - [100, 2048, 1, 10] + - [161, 4.124] + - - [512, 500, 1, 100] + - [146, 21.792] + - - [128, 2000, 1, 1024] + - [137, 30.914] + - - [200, 1024, 1, 500] + - [146, 23.718] + - - [256, 512, 1, 2000] + - [123, 28.419] + - - [256, 1024, 1, 2000] + - [146, 32.548] + - - [200, 512, 1, 500] + - [135, 19.283] + - - [64, 2000, 1, 512] + - [168, 24.237] + - - [200, 1024, 1, 100] + - [146, 17.569] + - - [200, 1024, 1, 1024] + - [137, 24.796] + - - [500, 512, 1, 2000] + - [125, 31.636] + - - [200, 500, 1, 512] + - [153, 18.967] + - - [256, 512, 1, 512] + - [135, 25.23] + - - [512, 512, 1, 500] + - [146, 30.702] + - - [100, 1024, 1, 512] + - [153, 19.743] + - - [128, 1024, 1, 1024] + - [135, 27.878] + - - [200, 512, 1, 2000] + - [144, 21.95] + - - [256, 1024, 1, 500] + - [146, 30.386] + - - [200, 1024, 1, 512] + - [137, 23.844] + - - [256, 500, 1, 500] + - [135, 23.709] + - - [256, 500, 1, 2048] + - [144, 26.759] + - - [512, 500, 1, 1024] + - [169, 31.027] + - - [256, 512, 1, 1024] + - [135, 27.675] + - - [128, 2048, 1, 1024] + - [154, 31.871] + - - [500, 512, 1, 500] + - [146, 29.705] + - - [200, 500, 1, 500] + - [135, 18.769] + - - [64, 2000, 1, 2000] + - [168, 27.806] + - - [128, 2000, 1, 2048] + - [137, 31.591] + - - [256, 1024, 1, 10] + - [161, 5.378] + - - [256, 1024, 1, 1024] + - [154, 31.803] + - - [500, 500, 1, 10] + - [124, 4.99] + - - [256, 500, 1, 100] + - [135, 12.89] + - - [256, 512, 1, 2048] + - [144, 27.828] + - - [200, 1024, 1, 2000] + - [125, 25.396] + - - [100, 2048, 1, 512] + - [137, 23.632] + - - [512, 500, 1, 2048] + - [146, 31.447] + - - [128, 2048, 1, 2000] + - [125, 32.511] + - - [500, 512, 1, 2048] + - [137, 31.438] + - - [200, 500, 1, 2000] + - [144, 21.435] + - - [500, 512, 1, 1024] + - [137, 30.833] + - - [100, 1024, 1, 100] + - [133, 10.548] + - - [64, 2000, 1, 500] + - [135, 24.061] + - - [128, 2048, 1, 2048] + - [137, 32.453] + - - [128, 2000, 1, 10] + - [161, 5.396] + - - [500, 512, 1, 10] + - [124, 5.112] + - - [200, 512, 1, 512] + - [153, 19.355] + - - [512, 500, 1, 500] + - [146, 29.768] + - - [512, 512, 1, 100] + - [154, 22.315] + - - [500, 500, 1, 512] + - [146, 28.677] + - - [128, 2048, 1, 500] + - [146, 30.17] + - - [200, 500, 1, 10] + - [124, 2.509] + - - [100, 2048, 1, 2000] + - [125, 25.365] + - - [200, 1024, 1, 10] + - [124, 4.485] + - - [64, 2048, 1, 1024] + - [135, 27.779] + - - [100, 2000, 1, 2000] + - [125, 24.729] + - - [500, 500, 1, 100] + - [125, 20.506] + - - [128, 2048, 1, 100] + - [146, 22.486] + - - [4096, 64, 1, 2048] + - [146, 31.763] + - - [4096, 91, 1, 2048] + - [153, 28.97] + - - [4096, 86, 1, 3072] + - [144, 27.517] + - - [4096, 49, 1, 2048] + - [146, 24.336] + - - [4096, 91, 1, 3072] + - [144, 29.182] + - - [4096, 64, 1, 3072] + - [137, 31.988] + - - [4096, 63, 1, 3072] + - [125, 31.474] + - - [4096, 96, 1, 2048] + - [168, 30.526] + - - [4096, 32, 1, 2048] + - [149, 25.23] + - - [4096, 49, 1, 3072] + - [125, 24.499] + - - [1024, 96, 1, 1024] + - [123, 19.441] + - - [4096, 86, 1, 2048] + - [153, 27.652] + - - [4096, 96, 1, 3072] + - [168, 30.635] + - - [4096, 35, 1, 3072] + - [146, 17.533] + - - [4096, 50, 1, 2048] + - [137, 24.828] + - - [36548, 32, 1, 1024] + - [162, 36.694] + - - [4096, 32, 1, 3072] + - [128, 25.135] + - - [1024, 243, 1, 1024] + - [125, 29.998] + - - [4096, 50, 1, 3072] + - [137, 25.031] + - - [1024, 128, 1, 1024] + - [153, 25.126] + - - [1024, 216, 1, 1024] + - [125, 26.624] + - - [4096, 35, 1, 2048] + - [125, 17.447] + - - [4096, 63, 1, 2048] + - [125, 31.284] + - - [289, 256, 1, 1568] + - [135, 16.355] + - - [3025, 64, 1, 363] + - [123, 26.822] + - - [784, 32, 32, 192] + - [135, 31.754] + - - [289, 256, 1, 2016] + - [135, 16.698] + - - [21609, 32, 1, 288] + - [162, 32.651] + - - [1225, 192, 1, 1728] + - [125, 28.979] + - - [784, 96, 1, 800] + - [122, 15.38] + - - [1225, 64, 1, 1200] + - [122, 16.378] + - - [729, 192, 1, 1600] + - [162, 28.162] + - - [6272, 32, 1, 528] + - [162, 29.083] + - - [1568, 160, 1, 832] + - [123, 29.299] + - - [289, 256, 1, 1792] + - [133, 16.653] + - - [784, 32, 32, 256] + - [162, 32.286] + - - [6272, 32, 1, 512] + - [123, 29.087] + - - [289, 384, 1, 3456] + - [144, 24.214] + - - [289, 384, 1, 2592] + - [123, 23.962] + - - [1225, 32, 32, 192] + - [162, 34.172] + - - [1568, 128, 1, 832] + - [123, 30.504] + - - [1225, 48, 32, 288] + - [162, 27.341] + - - [1001, 128, 1, 2048] + - [153, 25.965] + - - [2048, 174, 1, 512] + - [125, 28.212] + - - [2048, 189, 1, 512] + - [137, 30.454] + - - [64, 35, 904, 35] + - [162, 18.854] + - - [64, 103, 16, 103] + - [143, 11.04] + - - [64, 104, 16, 103] + - [168, 11.198] + - - [64, 123, 16, 112] + - [168, 13.896] + - - [64, 123, 16, 123] + - [168, 14.564] + - - [512, 540, 1, 512] + - [146, 31.808] + - - [512, 540, 1, 2048] + - [146, 34.244] + - - [512, 550, 1, 512] + - [146, 31.979] + - - [512, 550, 1, 2048] + - [146, 34.853] + - - [512, 560, 1, 512] + - [146, 32.53] + - - [512, 560, 1, 2048] + - [146, 35.417] + - - [2048, 160, 1, 512] + - [123, 30.156] + - - [2048, 184, 1, 512] + - [125, 29.75] + - - [512, 160, 1, 2048] + - [143, 17.474] + - - [512, 174, 1, 2048] + - [135, 19.003] + - - [512, 182, 1, 512] + - [135, 17.528] + - - [512, 184, 1, 512] + - [135, 17.722] + - - [512, 184, 1, 2048] + - [135, 19.874] + - - [512, 189, 1, 512] + - [153, 18.205] + - - [512, 189, 1, 2048] + - [144, 20.813] + - - [512, 198, 1, 2048] + - [135, 21.584] + - - [512, 206, 1, 512] + - [135, 19.806] + - - [512, 207, 1, 2048] + - [135, 22.554] + - - [512, 208, 1, 512] + - [135, 20.0] + - - [512, 208, 1, 2048] + - [135, 22.671] + - - [512, 224, 1, 512] + - [135, 22.189] + - - [512, 245, 1, 2048] + - [123, 26.163] + - - [512, 246, 1, 512] + - [168, 23.614] + - - [512, 246, 1, 2048] + - [123, 26.258] + - - [512, 264, 1, 512] + - [153, 25.058] + - - [512, 264, 1, 2048] + - [123, 26.339] + - - [512, 401, 1, 2048] + - [135, 32.719] + - - [512, 439, 1, 2048] + - [146, 27.824] + - - [512, 443, 1, 2048] + - [146, 28.049] + - - [512, 446, 1, 2048] + - [146, 28.18] + - - [512, 455, 1, 512] + - [146, 27.124] + - - [512, 465, 1, 512] + - [146, 27.693] + - - [512, 465, 1, 2048] + - [146, 29.489] + - - [512, 468, 1, 512] + - [146, 27.869] + - - [512, 468, 1, 2048] + - [146, 29.656] + - - [512, 476, 1, 512] + - [146, 28.374] + - - [512, 493, 1, 512] + - [146, 29.182] + - - [512, 493, 1, 2048] + - [146, 31.14] + - - [512, 495, 1, 2048] + - [146, 31.239] + - - [512, 511, 1, 2048] + - [146, 32.209] + - - [512, 512, 1, 2048] + - [146, 32.453] + - - [64, 59, 512, 59] + - [144, 30.743] + - - [64, 59, 544, 59] + - [144, 30.892] + - - [256, 1024, 1, 1] + - [161, 0.681] + - - [257, 1024, 1, 4096] + - [151, 26.764] + - - [512, 215, 1, 2048] + - [135, 23.213] + - - [512, 256, 1, 2048] + - [123, 27.35] + - - [560, 200, 1, 1024] + - [168, 23.348] + - - [768, 215, 1, 2048] + - [144, 26.258] + - - [768, 256, 1, 2048] + - [144, 31.334] + - - [32, 33, 1600, 33] + - [121, 14.374] + - - [512, 512, 1, 64] + - [146, 19.608] + - - [1225, 32, 64, 192] + - [162, 36.112] + - - [1225, 48, 64, 192] + - [153, 27.919] + - - [1225, 48, 64, 256] + - [162, 27.973] + - - [1225, 48, 64, 288] + - [123, 28.013] + - - [49, 2048, 64, 512] + - [135, 30.364] + - - [49, 512, 64, 2048] + - [141, 25.636] + - - [1225, 48, 32, 192] + - [135, 27.012] + - - [1225, 48, 32, 256] + - [123, 27.269] + - - [49, 2048, 32, 512] + - [135, 29.619] + - - [49, 512, 32, 2048] + - [135, 27.07] + - - [384, 384, 1, 384] + - [135, 24.237] + - - [100, 128, 18, 512] + - [169, 25.988] + - - [100, 128, 19, 512] + - [168, 21.724] + - - [1444, 128, 1, 576] + - [123, 27.228] + - - [361, 512, 1, 2304] + - [144, 29.597] + - - [480, 512, 1, 512] + - [137, 28.442] + - - [512, 480, 1, 512] + - [146, 28.528] + - - [1024, 308, 1, 1024] + - [123, 29.2] + - - [1024, 180, 1, 1024] + - [135, 28.424] + - - [64, 32, 4608, 32] + - [172, 32.381] + - - [64, 34, 4736, 34] + - [123, 20.19] + - - [64, 35, 4608, 32] + - [135, 21.652] + - - [64, 35, 4608, 35] + - [123, 20.925] + - - [256, 864, 1, 128] + - [123, 20.736] + - - [49, 2048, 64, 1024] + - [168, 28.925] + - - [49, 1024, 64, 2048] + - [158, 26.385] + - - [49, 2048, 32, 1024] + - [168, 28.0] + - - [49, 1024, 32, 2048] + - [141, 25.622] + - - [3136, 64, 1, 576] + - [123, 28.501] + - - [784, 128, 1, 1152] + - [123, 20.488] + - - [49, 2048, 128, 512] + - [135, 30.72] + - - [49, 2048, 256, 512] + - [153, 30.937] + - - [49, 512, 128, 2048] + - [141, 26.353] + - - [49, 512, 256, 2048] + - [141, 26.61] + - - [1024, 128, 1, 2] + - [161, 0.641] + - - [1024, 96, 1, 2] + - [161, 0.559] + - - [1909283, 40, 1, 40] + - [161, 15.606] + - - [3818566, 40, 1, 40] + - [152, 15.669] + - - [2560, 35, 1, 29000] + - [143, 13.589] + - - [2560, 36, 1, 29000] + - [143, 13.995] + - - [2560, 39, 1, 29000] + - [143, 15.15] + - - [2560, 40, 1, 29000] + - [143, 15.529] + - - [2560, 42, 1, 29000] + - [143, 16.337] + - - [2560, 43, 1, 29000] + - [143, 16.729] + - - [2560, 44, 1, 29000] + - [143, 17.086] + - - [2560, 46, 1, 29000] + - [143, 17.848] + - - [2560, 48, 1, 29000] + - [143, 18.611] + - - [2560, 49, 1, 29000] + - [123, 18.439] + - - [2560, 50, 1, 29000] + - [123, 18.778] + - - [2560, 51, 1, 29000] + - [123, 19.206] + - - [2560, 53, 1, 29000] + - [123, 19.906] + - - [2560, 54, 1, 29000] + - [123, 20.307] + - - [2560, 55, 1, 29000] + - [123, 20.601] + - - [2560, 56, 1, 29000] + - [123, 21.029] + - - [2560, 57, 1, 29000] + - [123, 21.435] + - - [2560, 58, 1, 29000] + - [123, 21.765] + - - [2560, 59, 1, 29000] + - [123, 22.211] + - - [2560, 61, 1, 29000] + - [123, 22.911] + - - [2560, 63, 1, 29000] + - [123, 23.511] + - - [1760, 32, 1, 1760] + - [177, 15.94] + - - [3584, 4, 1, 1280] + - [181, 5.193] + - - [2560, 16, 1, 2560] + - [182, 13.824] + - - [2944, 4, 1, 256] + - [176, 2.766] + - - [5056, 4, 1, 3328] + - [180, 7.557] + - - [1760, 16, 1, 1760] + - [181, 10.508] + - - [2368, 4, 1, 1280] + - [181, 3.451] + - - [6784, 4, 1, 1280] + - [183, 8.5] + - - [8448, 4, 1, 2816] + - [183, 10.765] + - - [1856, 4, 1, 1280] + - [181, 2.897] + - - [4608, 1, 1, 1536] + - [181, 1.701] + - - [7680, 4, 1, 2560] + - [180, 10.224] + - - [8448, 16, 1, 2816] + - [201, 25.794] + - - [3072, 2, 1, 1024] + - [181, 2.161] + - - [2368, 4, 1, 256] + - [176, 2.269] + - - [7680, 1, 1, 2560] + - [180, 2.518] + - - [4608, 2, 1, 1536] + - [181, 3.402] + - - [4608, 4, 1, 1536] + - [181, 6.786] + - - [3072, 1, 1, 128] + - [140, 0.505] + - - [2048, 32, 1, 2048] + - [177, 19.139] + - - [4288, 4, 1, 256] + - [178, 3.853] + - - [3584, 4, 1, 3328] + - [180, 5.694] + - - [5888, 4, 1, 1280] + - [182, 7.48] + - - [2048, 16, 1, 2048] + - [181, 12.493] + - - [5888, 4, 1, 128] + - [178, 3.618] + - - [8448, 1, 1, 2816] + - [192, 2.784] + - - [1408, 4, 1, 256] + - [148, 1.408] + - - [6144, 4, 1, 2560] + - [180, 8.473] + - - [3072, 1, 1, 1024] + - [181, 1.078] + - - [5056, 4, 1, 1280] + - [202, 6.799] + - - [3072, 16, 1, 1024] + - [182, 15.2] + - - [1408, 4, 1, 3328] + - [181, 2.504] + - - [6144, 1, 1, 2560] + - [180, 2.134] + - - [6144, 16, 1, 2560] + - [189, 23.217] + - - [4096, 16, 1, 4096] + - [201, 19.64] + - - [1408, 4, 1, 128] + - [164, 0.984] + - - [1856, 4, 1, 256] + - [148, 1.818] + - - [6784, 4, 1, 128] + - [176, 4.124] + - - [2944, 4, 1, 128] + - [164, 1.99] + - - [5888, 4, 1, 3328] + - [204, 8.225] + - - [5056, 4, 1, 128] + - [184, 3.208] + - - [3072, 4, 1, 1024] + - [181, 4.309] + - - [2944, 4, 1, 3328] + - [180, 4.692] + - - [2368, 4, 1, 128] + - [176, 1.611] + - - [1856, 4, 1, 128] + - [164, 1.286] + - - [7680, 2, 1, 2560] + - [190, 5.062] + - - [7680, 16, 1, 2560] + - [201, 23.28] + - - [4224, 1, 1, 128] + - [178, 0.672] + - - [8448, 2, 1, 2816] + - [190, 5.671] + - - [1408, 4, 1, 1280] + - [181, 2.202] + - - [6784, 4, 1, 256] + - [176, 5.599] + - - [4288, 4, 1, 128] + - [184, 2.797] + - - [1856, 4, 1, 3328] + - [181, 3.298] + - - [3584, 4, 1, 256] + - [184, 3.379] + - - [2368, 4, 1, 3328] + - [180, 3.781] + - - [6784, 4, 1, 3328] + - [183, 9.393] + - - [4288, 4, 1, 1280] + - [181, 6.159] + - - [3584, 4, 1, 128] + - [184, 2.378] + - - [5056, 4, 1, 256] + - [195, 4.39] + - - [4288, 4, 1, 3328] + - [181, 6.759] + - - [4608, 16, 1, 1536] + - [179, 20.853] + - - [6144, 2, 1, 2560] + - [180, 4.246] + - - [2944, 4, 1, 1280] + - [181, 4.268] + - - [5888, 4, 1, 256] + - [178, 5.017] + - - [4096, 29, 1, 2048] + - [201, 22.527] + - - [4096, 25, 1, 2048] + - [189, 19.477] + - - [4096, 29, 1, 3072] + - [201, 22.852] + - - [4096, 24, 1, 2048] + - [201, 23.136] + - - [36548, 1, 1, 1024] + - [174, 1.845] + - - [4096, 27, 1, 2048] + - [201, 20.596] + - - [4096, 1, 1, 2048] + - [181, 1.557] + - - [4096, 24, 1, 3072] + - [201, 23.502] + - - [4096, 27, 1, 3072] + - [201, 21.259] + - - [36548, 25, 1, 1024] + - [194, 22.162] + - - [4096, 1, 1, 3072] + - [181, 1.552] + - - [4096, 25, 1, 3072] + - [201, 19.667] + - - [36548, 24, 1, 1024] + - [189, 25.766] + - - [6272, 16, 1, 480] + - [189, 18.354] + - - [1568, 32, 1, 832] + - [127, 14.853] + - - [1568, 48, 1, 832] + - [187, 16.797] + - - [6272, 24, 1, 512] + - [179, 21.571] + - - [2048, 1, 1, 512] + - [127, 0.623] + - - [2048, 2, 1, 2] + - [173, 0.023] + - - [2048, 2, 1, 2048] + - [181, 1.723] + - - [2560, 4, 1, 2] + - [173, 0.063] + - - [2560, 4, 1, 2560] + - [181, 3.997] + - - [12288, 12, 2, 256] + - [177, 18.755] + - - [12288, 3, 2, 256] + - [174, 6.925] + - - [51520, 12, 2, 256] + - [201, 21.706] + - - [51520, 3, 2, 256] + - [184, 9.944] + - - [15200, 12, 2, 256] + - [201, 19.012] + - - [15200, 3, 2, 256] + - [193, 7.273] + - - [3456, 3, 2, 256] + - [175, 4.16] + - - [13600, 12, 2, 256] + - [199, 19.058] + - - [12880, 3, 2, 256] + - [174, 6.898] + - - [3400, 3, 2, 256] + - [178, 4.061] + - - [12880, 12, 2, 256] + - [189, 18.214] + - - [13824, 12, 2, 256] + - [199, 19.432] + - - [13824, 3, 2, 256] + - [205, 7.268] + - - [13600, 3, 2, 256] + - [174, 7.16] + - - [3456, 12, 2, 256] + - [188, 13.052] + - - [3800, 3, 2, 256] + - [178, 4.48] + - - [3400, 12, 2, 256] + - [200, 12.773] + - - [3800, 12, 2, 256] + - [201, 13.607] + - - [55296, 3, 2, 256] + - [184, 10.679] + - - [3220, 3, 2, 256] + - [178, 3.916] + - - [3072, 3, 2, 256] + - [175, 3.83] + - - [3220, 12, 2, 256] + - [189, 13.165] + - - [3072, 12, 2, 256] + - [201, 12.673] + - - [54400, 3, 2, 256] + - [196, 10.729] + - - [60800, 12, 2, 256] + - [197, 21.692] + - - [60800, 3, 2, 256] + - [198, 10.661] + - - [1909283, 11, 1, 11] + - [185, 9.199] + - - [3818566, 11, 1, 11] + - [186, 5.076] + - - [2048, 8, 1, 2] + - [143, 0.081] + - - [2048, 8, 1, 2048] + - [203, 6.524] + - - [2560, 2, 1, 2] + - [173, 0.032] + - - [2560, 2, 1, 2560] + - [181, 1.999] + - - [2560, 27, 1, 29000] + - [191, 16.355] + - - [4, 1856, 1, 3328] + - [155, 2.45] + - - [35, 1500, 1, 2560] + - [132, 12.601] + - - [4, 2368, 1, 1280] + - [132, 2.761] + - - [4, 3584, 1, 128] + - [208, 2.179] + - - [4, 1408, 1, 3328] + - [155, 2.026] + - - [4, 6784, 1, 3328] + - [132, 4.232] + - - [4, 4288, 1, 128] + - [208, 2.54] + - - [4, 6784, 1, 1280] + - [132, 4.124] + - - [4, 5056, 1, 256] + - [208, 3.339] + - - [4, 2944, 1, 3328] + - [155, 3.257] + - - [4, 5056, 1, 1280] + - [208, 4.043] + - - [35, 1500, 1, 2048] + - [150, 12.294] + - - [4, 2368, 1, 3328] + - [155, 2.91] + - - [4, 1856, 1, 256] + - [132, 1.737] + - - [4, 2944, 1, 256] + - [132, 2.527] + - - [4, 6784, 1, 128] + - [207, 3.014] + - - [4, 3584, 1, 1280] + - [132, 3.528] + - - [4, 5888, 1, 256] + - [206, 3.542] + - - [4, 5888, 1, 3328] + - [207, 4.421] + - - [4, 6784, 1, 256] + - [206, 3.307] + - - [4, 1408, 1, 1280] + - [155, 1.927] + - - [4, 3584, 1, 256] + - [132, 2.806] + - - [4, 2944, 1, 1280] + - [132, 3.136] + - - [4, 1408, 1, 256] + - [132, 1.354] + - - [4, 4288, 1, 3328] + - [132, 4.142] + - - [4, 2368, 1, 128] + - [132, 1.588] + - - [4, 5888, 1, 1280] + - [207, 4.295] + - - [4, 1856, 1, 1280] + - [155, 2.346] + - - [4, 1856, 1, 128] + - [132, 1.268] + - - [4, 2944, 1, 128] + - [207, 1.922] + - - [4, 4288, 1, 1280] + - [132, 3.979] + - - [4, 5056, 1, 3328] + - [206, 4.124] + - - [4, 5056, 1, 128] + - [207, 2.716] + - - [4, 4288, 1, 256] + - [206, 3.208] + - - [4, 3584, 1, 3328] + - [132, 3.618] + - - [4, 2368, 1, 256] + - [132, 2.102] + - - [4, 5888, 1, 128] + - [207, 2.955] + - - [4, 1408, 1, 128] + - [132, 0.984] + - - [16, 2000, 1, 2048] + - [170, 10.336] + - - [2, 2048, 1, 2000] + - [126, 1.299] + - - [32, 2000, 1, 2048] + - [134, 16.202] + - - [10, 2000, 1, 1024] + - [155, 6.127] + - - [2, 2000, 1, 100] + - [120, 0.568] + - - [10, 2000, 1, 512] + - [155, 5.608] + - - [32, 2000, 1, 500] + - [122, 14.794] + - - [32, 2000, 1, 1024] + - [152, 16.071] + - - [4, 2048, 1, 500] + - [126, 2.166] + - - [16, 2000, 1, 500] + - [147, 8.455] + - - [4, 2048, 1, 100] + - [132, 1.164] + - - [16, 2000, 1, 100] + - [147, 4.485] + - - [4, 2000, 1, 10] + - [124, 0.217] + - - [10, 2000, 1, 10] + - [124, 0.537] + - - [2, 2048, 1, 512] + - [155, 1.164] + - - [10, 2048, 1, 100] + - [124, 2.802] + - - [8, 2048, 1, 100] + - [132, 2.31] + - - [2, 2048, 1, 1024] + - [155, 1.259] + - - [16, 2000, 1, 1024] + - [155, 9.881] + - - [10, 2000, 1, 2000] + - [126, 6.334] + - - [8, 2000, 1, 500] + - [147, 4.227] + - - [16, 2000, 1, 2000] + - [126, 10.174] + - - [10, 2048, 1, 2048] + - [170, 6.655] + - - [8, 2000, 1, 512] + - [155, 4.507] + - - [2, 2000, 1, 2048] + - [155, 1.304] + - - [16, 2048, 1, 500] + - [126, 8.635] + - - [8, 2048, 1, 1024] + - [155, 5.008] + - - [2, 2000, 1, 500] + - [147, 1.06] + - - [32, 2048, 1, 100] + - [136, 9.069] + - - [10, 2048, 1, 500] + - [126, 5.382] + - - [4, 2000, 1, 2048] + - [170, 2.594] + - - [8, 2000, 1, 1024] + - [155, 4.913] + - - [32, 2048, 1, 512] + - [152, 15.2] + - - [32, 2048, 1, 1024] + - [134, 16.382] + - - [32, 2048, 1, 500] + - [122, 15.209] + - - [10, 2048, 1, 1024] + - [155, 6.289] + - - [8, 2048, 1, 2048] + - [155, 5.292] + - - [16, 2048, 1, 2048] + - [170, 10.603] + - - [8, 2000, 1, 10] + - [124, 0.447] + - - [4, 2000, 1, 2000] + - [126, 2.531] + - - [8, 2048, 1, 512] + - [155, 4.584] + - - [8, 2000, 1, 2048] + - [155, 5.175] + - - [32, 2048, 1, 2000] + - [152, 17.28] + - - [16, 2000, 1, 10] + - [124, 0.88] + - - [8, 2048, 1, 2000] + - [126, 5.184] + - - [4, 2048, 1, 2048] + - [155, 2.644] + - - [10, 2048, 1, 2000] + - [126, 6.488] + - - [8, 2000, 1, 100] + - [126, 2.269] + - - [2, 2000, 1, 2000] + - [126, 1.268] + - - [16, 2048, 1, 1024] + - [155, 10.052] + - - [32, 2000, 1, 2000] + - [134, 16.937] + - - [32, 2048, 1, 2048] + - [167, 16.481] + - - [2, 2048, 1, 10] + - [124, 0.113] + - - [4, 2048, 1, 512] + - [155, 2.319] + - - [4, 2048, 1, 10] + - [124, 0.226] + - - [16, 2048, 1, 100] + - [147, 4.561] + - - [4, 2000, 1, 500] + - [147, 2.116] + - - [10, 2000, 1, 500] + - [155, 5.261] + - - [32, 2000, 1, 512] + - [152, 14.875] + - - [2, 2000, 1, 1024] + - [155, 1.232] + - - [2, 2000, 1, 512] + - [155, 1.128] + - - [4, 2048, 1, 1024] + - [155, 2.527] + - - [8, 2048, 1, 500] + - [147, 4.318] + - - [4, 2048, 1, 2000] + - [126, 2.594] + - - [8, 2000, 1, 2000] + - [126, 5.076] + - - [4, 2000, 1, 1024] + - [155, 2.459] + - - [32, 2000, 1, 100] + - [124, 8.748] + - - [2, 2048, 1, 100] + - [132, 0.582] + - - [8, 2048, 1, 10] + - [124, 0.451] + - - [2, 2048, 1, 2048] + - [170, 1.322] + - - [10, 2000, 1, 2048] + - [155, 6.497] + - - [16, 2048, 1, 2000] + - [126, 10.404] + - - [10, 2048, 1, 512] + - [155, 5.784] + - - [16, 2048, 1, 512] + - [155, 9.276] + - - [2, 2000, 1, 10] + - [124, 0.108] + - - [4, 2000, 1, 100] + - [132, 1.128] + - - [16, 2000, 1, 512] + - [155, 9.06] + - - [32, 2048, 1, 10] + - [124, 1.782] + - - [10, 2048, 1, 10] + - [124, 0.555] + - - [4, 2000, 1, 512] + - [155, 2.26] + - - [16, 2048, 1, 10] + - [122, 0.889] + - - [32, 2000, 1, 10] + - [129, 1.742] + - - [10, 2000, 1, 100] + - [132, 2.752] + - - [2, 2048, 1, 500] + - [126, 1.078] + - - [1024, 1, 1, 500000] + - [211, 0.848] + - - [1024, 16, 1, 500000] + - [212, 13.052] + - - [1024, 2, 1, 500000] + - [213, 1.692] + - - [512, 1, 1, 500000] + - [214, 0.753] + - - [1024, 8, 1, 500000] + - [213, 6.632] + - - [1024, 4, 1, 500000] + - [216, 3.361] + - - [512, 16, 1, 500000] + - [214, 11.875] + - - [512, 2, 1, 500000] + - [215, 1.516] + - - [512, 8, 1, 500000] + - [215, 6.005] + - - [512, 4, 1, 500000] + - [213, 3.023] + - - [1024, 20, 1, 30522] + - [210, 15.444] + - - [49, 512, 1, 4608] + - [209, 14.726] + - - [64, 512, 1, 1] + - [125, 0.086] + - - [1024, 32, 1, 2] + - [129, 0.199] + - - [1024, 32, 1, 1024] + - [127, 11.13] + - - [768, 32, 1, 768] + - [127, 8.617] + - - [768, 32, 1, 2] + - [161, 0.153] + - - [768, 64, 1, 768] + - [148, 14.361] + - - [768, 64, 1, 2] + - [161, 0.298] + - - [1024, 20, 1, 1024] + - [127, 7.169] + - - [1024, 80, 1, 1024] + - [122, 17.032] + - - [32, 200, 1, 1] + - [120, 0.018] + - - [1024, 4, 1, 1024] + - [148, 1.498] + - - [1024, 4, 1, 2] + - [120, 0.023] + - - [768, 16, 1, 768] + - [164, 4.453] + - - [768, 16, 1, 2] + - [143, 0.077] + - - [768, 8, 1, 768] + - [164, 2.193] + - - [1024, 6, 1, 1024] + - [148, 2.251] + - - [1024, 6, 1, 2] + - [120, 0.036] + - - [1024, 8, 1, 1024] + - [148, 3.005] + - - [4, 704, 1, 1280] + - [157, 1.038] + - - [512, 4, 1, 512] + - [148, 0.663] + - - [64, 4, 1, 256] + - [126, 0.063] + - - [64, 704, 1, 128] + - [134, 7.981] + - - [448, 64, 1, 1280] + - [127, 10.792] + - - [128, 4, 1, 1280] + - [140, 0.199] + - - [128, 256, 1, 256] + - [148, 8.446] + - - [64, 1024, 1, 1280] + - [134, 16.838] + - - [64, 704, 1, 1280] + - [152, 13.973] + - - [64, 64, 1, 1280] + - [127, 1.593] + - - [1024, 64, 1, 128] + - [124, 10.454] + - - [64, 1024, 1, 3328] + - [134, 17.542] + - - [128, 1, 1, 1408] + - [126, 0.05] + - - [1024, 64, 1, 1280] + - [161, 16.513] + - - [704, 4, 1, 1280] + - [148, 1.087] + - - [64, 256, 1, 128] + - [134, 3.054] + - - [256, 256, 1, 3328] + - [134, 17.293] + - - [64, 1024, 1, 128] + - [134, 10.63] + - - [128, 256, 1, 3328] + - [164, 13.052] + - - [64, 448, 1, 1280] + - [148, 10.562] + - - [448, 4, 1, 256] + - [148, 0.447] + - - [256, 4, 1, 1280] + - [127, 0.393] + - - [512, 32, 1, 512] + - [127, 5.392] + - - [64, 64, 1, 3328] + - [164, 1.719] + - - [512, 1, 1, 512] + - [148, 0.167] + - - [704, 64, 1, 3328] + - [148, 14.744] + - - [256, 4, 1, 256] + - [148, 0.257] + - - [256, 64, 1, 1280] + - [140, 6.42] + - - [1024, 4, 1, 256] + - [148, 1.047] + - - [4, 704, 1, 256] + - [132, 0.69] + - - [704, 64, 1, 1280] + - [148, 13.991] + - - [128, 448, 1, 256] + - [122, 11.911] + - - [128, 256, 1, 1280] + - [164, 12.146] + - - [448, 64, 1, 3328] + - [164, 11.505] + - - [256, 128, 1, 128] + - [148, 6.028] + - - [4, 448, 1, 128] + - [134, 0.32] + - - [64, 128, 1, 3328] + - [171, 3.465] + - - [128, 128, 1, 3328] + - [127, 6.971] + - - [256, 128, 1, 256] + - [157, 8.41] + - - [64, 1, 1, 1216] + - [126, 0.023] + - - [1024, 4, 1, 3328] + - [131, 1.692] + - - [4, 4, 1, 256] + - [120, 0.005] + - - [256, 64, 1, 256] + - [127, 4.507] + - - [256, 128, 1, 1280] + - [148, 12.209] + - - [128, 64, 1, 1280] + - [157, 3.221] + - - [4, 448, 1, 3328] + - [171, 0.753] + - - [64, 1024, 1, 256] + - [167, 13.422] + - - [64, 704, 1, 256] + - [134, 10.409] + - - [704, 64, 1, 128] + - [134, 7.981] + - - [448, 4, 1, 1280] + - [148, 0.686] + - - [1024, 2, 1, 512] + - [148, 0.659] + - - [256, 64, 1, 3328] + - [148, 6.948] + - - [448, 128, 1, 256] + - [122, 11.911] + - - [448, 64, 1, 128] + - [148, 5.306] + - - [4, 448, 1, 256] + - [134, 0.442] + - - [64, 704, 1, 3328] + - [148, 14.744] + - - [256, 256, 1, 256] + - [167, 13.278] + - - [4, 1024, 1, 3328] + - [140, 1.597] + - - [4, 704, 1, 128] + - [132, 0.501] + - - [64, 128, 1, 128] + - [127, 1.534] + - - [704, 4, 1, 128] + - [148, 0.492] + - - [64, 448, 1, 3328] + - [148, 11.315] + - - [448, 4, 1, 3328] + - [140, 0.744] + - - [256, 4, 1, 3328] + - [131, 0.424] + - - [4, 256, 1, 256] + - [132, 0.248] + - - [4, 64, 1, 1280] + - [127, 0.099] + - - [4, 4, 1, 128] + - [120, 0.005] + - - [4, 128, 1, 256] + - [132, 0.126] + - - [448, 128, 1, 3328] + - [143, 15.665] + - - [64, 448, 1, 256] + - [148, 7.39] + - - [64, 256, 1, 1280] + - [157, 6.411] + - - [1024, 32, 1, 512] + - [127, 10.228] + - - [64, 4, 1, 128] + - [127, 0.045] + - - [256, 64, 1, 128] + - [126, 3.072] + - - [64, 64, 1, 256] + - [157, 1.083] + - - [4, 704, 1, 3328] + - [140, 1.11] + - - [4, 4, 1, 1280] + - [120, 0.005] + - - [128, 128, 1, 128] + - [127, 3.285] + - - [1024, 4, 1, 128] + - [148, 0.726] + - - [4, 64, 1, 128] + - [132, 0.045] + - - [64, 128, 1, 1280] + - [127, 3.194] + - - [128, 128, 1, 1280] + - [140, 6.429] + - - [512, 2, 1, 512] + - [140, 0.329] + - - [64, 128, 1, 256] + - [157, 2.179] + - - [1024, 4, 1, 1280] + - [148, 1.543] + - - [35, 700, 1, 2048] + - [170, 8.18] + - - [704, 64, 1, 256] + - [152, 10.575] + - - [128, 448, 1, 1280] + - [161, 15.011] + - - [128, 64, 1, 3328] + - [127, 3.492] + - - [448, 64, 1, 256] + - [148, 7.458] + - - [1024, 16, 1, 512] + - [148, 5.392] + - - [4, 256, 1, 128] + - [132, 0.18] + - - [512, 16, 1, 512] + - [140, 2.689] + - - [1024, 64, 1, 256] + - [161, 13.278] + - - [4, 4, 1, 3328] + - [120, 0.005] + - - [4, 1024, 1, 1280] + - [157, 1.493] + - - [704, 4, 1, 256] + - [148, 0.708] + - - [128, 64, 1, 256] + - [157, 2.17] + - - [128, 4, 1, 3328] + - [164, 0.217] + - - [128, 4, 1, 128] + - [132, 0.09] + - - [128, 1, 1, 1024] + - [127, 0.05] + - - [4, 128, 1, 3328] + - [127, 0.217] + - - [256, 256, 1, 128] + - [124, 10.693] + - - [704, 4, 1, 3328] + - [127, 1.173] + - - [448, 128, 1, 1280] + - [143, 15.011] + - - [1024, 64, 1, 3328] + - [161, 17.199] + - - [256, 4, 1, 128] + - [164, 0.18] + - - [4, 1024, 1, 128] + - [134, 0.726] + - - [64, 256, 1, 3328] + - [148, 6.844] + - - [448, 128, 1, 128] + - [122, 9.569] + - - [128, 256, 1, 128] + - [148, 6.064] + - - [128, 4, 1, 256] + - [127, 0.126] + - - [256, 256, 1, 1280] + - [143, 16.973] + - - [256, 128, 1, 3328] + - [164, 13.17] + - - [4, 448, 1, 1280] + - [140, 0.69] + - - [448, 4, 1, 128] + - [138, 0.307] + - - [4, 256, 1, 3328] + - [127, 0.429] + - - [4, 128, 1, 128] + - [132, 0.09] + - - [4, 256, 1, 1280] + - [140, 0.393] + - - [64, 4, 1, 3328] + - [148, 0.108] + - - [4, 64, 1, 3328] + - [127, 0.108] + - - [35, 700, 1, 2560] + - [126, 8.338] + - - [4, 1024, 1, 256] + - [140, 1.002] + - - [64, 256, 1, 256] + - [157, 4.322] + - - [1024, 4, 1, 512] + - [148, 1.331] + - - [4, 64, 1, 256] + - [140, 0.063] + - - [128, 448, 1, 128] + - [122, 9.741] + - - [64, 448, 1, 128] + - [134, 5.306] + - - [128, 448, 1, 3328] + - [122, 15.683] + - - [4, 128, 1, 1280] + - [140, 0.199] + - - [128, 64, 1, 128] + - [148, 1.534] + - - [64, 64, 1, 128] + - [148, 0.767] + - - [64, 4, 1, 1280] + - [148, 0.099] + - - [1024, 1, 1, 512] + - [148, 0.334] + - - [128, 128, 1, 256] + - [157, 4.34] + - - [64, 12, 5040, 12] + - [161, 12.967] + - - [64, 17, 3632, 17] + - [162, 16.874] + - - [64, 19, 3264, 19] + - [162, 19.224] + - - [64, 9, 6544, 9] + - [122, 8.798] + - - [64, 7, 8192, 7] + - [135, 6.186] + - - [64, 16, 3840, 16] + - [161, 18.873] + - - [64, 8, 7280, 8] + - [168, 7.923] + - - [64, 27, 2336, 27] + - [162, 26.353] + - - [64, 11, 5456, 11] + - [122, 11.667] + - - [64, 21, 2976, 21] + - [168, 21.557] + - - [64, 10, 5952, 10] + - [122, 10.278] + - - [64, 14, 4368, 14] + - [161, 15.606] + - - [64, 25, 2512, 25] + - [168, 24.666] + - - [64, 13, 4672, 13] + - [161, 14.284] + - - [64, 15, 4096, 15] + - [161, 16.865] + - - [64, 29, 2176, 29] + - [162, 27.725] + - - [64, 18, 3440, 18] + - [144, 18.602] + - - [64, 23, 2720, 23] + - [168, 23.109] + - - [8, 500, 1, 512] + - [140, 1.268] + - - [32, 512, 1, 512] + - [157, 5.423] + - - [8, 512, 1, 500] + - [157, 1.286] + - - [8, 500, 1, 1024] + - [140, 1.493] + - - [64, 1024, 1, 100] + - [124, 9.181] + - - [64, 1024, 1, 500] + - [143, 15.304] + - - [64, 1024, 1, 1024] + - [167, 16.526] + - - [2, 500, 1, 2048] + - [140, 0.406] + - - [16, 512, 1, 10] + - [126, 0.239] + - - [8, 512, 1, 10] + - [126, 0.113] + - - [16, 500, 1, 2048] + - [157, 3.248] + - - [10, 100, 1, 500] + - [157, 0.316] + - - [16, 100, 1, 10] + - [129, 0.045] + - - [2, 100, 1, 2000] + - [127, 0.081] + - - [256, 100, 1, 2048] + - [148, 9.989] + - - [2, 512, 1, 512] + - [140, 0.325] + - - [2, 100, 1, 10] + - [120, 0.005] + - - [200, 100, 1, 100] + - [167, 3.027] + - - [500, 100, 1, 100] + - [132, 7.007] + - - [4, 100, 1, 10] + - [120, 0.009] + - - [32, 100, 1, 512] + - [148, 1.02] + - - [16, 1024, 1, 512] + - [157, 5.103] + - - [4, 1024, 1, 1024] + - [157, 1.457] + - - [4, 512, 1, 10] + - [126, 0.059] + - - [128, 100, 1, 10] + - [129, 0.352] + - - [4, 512, 1, 2048] + - [140, 0.83] + - - [10, 1024, 1, 2000] + - [148, 3.826] + - - [256, 100, 1, 100] + - [166, 3.749] + - - [64, 1024, 1, 2048] + - [167, 17.068] + - - [16, 1024, 1, 100] + - [148, 2.387] + - - [32, 1024, 1, 1024] + - [157, 11.735] + - - [8, 100, 1, 500] + - [127, 0.253] + - - [10, 512, 1, 512] + - [140, 1.602] + - - [8, 500, 1, 10] + - [122, 0.108] + - - [16, 1024, 1, 10] + - [126, 0.456] + - - [16, 512, 1, 2048] + - [157, 3.325] + - - [128, 512, 1, 2048] + - [134, 17.203] + - - [128, 512, 1, 100] + - [124, 9.014] + - - [64, 500, 1, 2048] + - [157, 12.218] + - - [500, 100, 1, 10] + - [129, 1.295] + - - [64, 100, 1, 2048] + - [157, 2.59] + - - [64, 100, 1, 10] + - [129, 0.176] + - - [16, 512, 1, 500] + - [127, 2.554] + - - [200, 100, 1, 2000] + - [148, 7.819] + - - [2, 100, 1, 512] + - [127, 0.063] + - - [32, 512, 1, 100] + - [132, 2.463] + - - [16, 512, 1, 1024] + - [140, 3.032] + - - [4, 1024, 1, 512] + - [157, 1.277] + - - [2, 500, 1, 500] + - [127, 0.316] + - - [32, 100, 1, 100] + - [132, 0.483] + - - [100, 500, 1, 2000] + - [122, 13.192] + - - [10, 512, 1, 10] + - [126, 0.14] + - - [100, 500, 1, 2048] + - [152, 13.17] + - - [2, 100, 1, 1024] + - [127, 0.072] + - - [32, 512, 1, 1024] + - [171, 6.226] + - - [256, 100, 1, 1024] + - [164, 9.267] + - - [128, 100, 1, 100] + - [132, 1.963] + - - [32, 512, 1, 10] + - [129, 0.469] + - - [128, 100, 1, 1024] + - [148, 4.814] + - - [16, 500, 1, 2000] + - [148, 3.167] + - - [64, 500, 1, 500] + - [148, 9.754] + - - [128, 512, 1, 1024] + - [134, 16.599] + - - [128, 512, 1, 2000] + - [122, 17.546] + - - [2, 512, 1, 10] + - [120, 0.027] + - - [10, 512, 1, 500] + - [164, 1.602] + - - [4, 1024, 1, 2000] + - [148, 1.529] + - - [256, 100, 1, 2000] + - [148, 10.007] + - - [100, 100, 1, 10] + - [129, 0.28] + - - [128, 512, 1, 10] + - [129, 1.719] + - - [256, 100, 1, 500] + - [148, 8.044] + - - [64, 100, 1, 512] + - [140, 2.044] + - - [64, 512, 1, 500] + - [148, 10.043] + - - [8, 100, 1, 512] + - [140, 0.253] + - - [32, 100, 1, 500] + - [140, 1.006] + - - [32, 500, 1, 2048] + - [171, 6.501] + - - [128, 500, 1, 2000] + - [143, 17.117] + - - [8, 1024, 1, 10] + - [129, 0.221] + - - [2, 500, 1, 100] + - [134, 0.149] + - - [10, 500, 1, 512] + - [171, 1.597] + - - [32, 500, 1, 500] + - [148, 5.085] + - - [100, 500, 1, 100] + - [132, 7.007] + - - [10, 1024, 1, 512] + - [140, 3.19] + - - [512, 100, 1, 512] + - [143, 12.069] + - - [4, 500, 1, 500] + - [148, 0.636] + - - [64, 100, 1, 1024] + - [171, 2.382] + - - [2, 500, 1, 2000] + - [127, 0.397] + - - [32, 512, 1, 2048] + - [171, 6.709] + - - [10, 100, 1, 2000] + - [127, 0.402] + - - [4, 100, 1, 512] + - [140, 0.126] + - - [2, 512, 1, 2048] + - [140, 0.415] + - - [100, 100, 1, 2000] + - [127, 4.015] + - - [10, 500, 1, 500] + - [127, 1.579] + - - [2, 100, 1, 2048] + - [127, 0.081] + - - [32, 100, 1, 2048] + - [171, 1.304] + - - [16, 100, 1, 1024] + - [140, 0.596] + - - [2, 500, 1, 10] + - [120, 0.027] + - - [500, 100, 1, 2048] + - [143, 13.089] + - - [16, 1024, 1, 2000] + - [148, 6.109] + - - [10, 1024, 1, 1024] + - [157, 3.632] + - - [500, 100, 1, 512] + - [152, 11.667] + - - [32, 512, 1, 500] + - [164, 5.161] + - - [100, 500, 1, 512] + - [152, 11.712] + - - [8, 500, 1, 2000] + - [148, 1.584] + - - [4, 100, 1, 1024] + - [127, 0.149] + - - [2, 500, 1, 1024] + - [140, 0.374] + - - [100, 500, 1, 1024] + - [152, 12.664] + - - [32, 100, 1, 1024] + - [127, 1.196] + - - [64, 100, 1, 2000] + - [171, 2.572] + - - [64, 500, 1, 10] + - [129, 0.893] + - - [64, 500, 1, 512] + - [157, 9.678] + - - [10, 100, 1, 1024] + - [140, 0.374] + - - [16, 512, 1, 100] + - [164, 1.209] + - - [4, 100, 1, 2000] + - [127, 0.158] + - - [2, 512, 1, 1024] + - [140, 0.379] + - - [64, 512, 1, 1024] + - [157, 11.717] + - - [512, 100, 1, 2048] + - [143, 13.72] + - - [32, 100, 1, 2000] + - [140, 1.277] + - - [4, 512, 1, 500] + - [127, 0.641] + - - [4, 500, 1, 1024] + - [140, 0.744] + - - [32, 100, 1, 10] + - [129, 0.09] + - - [10, 1024, 1, 2048] + - [171, 3.885] + - - [8, 500, 1, 100] + - [126, 0.587] + - - [200, 100, 1, 1024] + - [164, 7.232] + - - [16, 100, 1, 100] + - [127, 0.23] + - - [8, 1024, 1, 2000] + - [148, 3.054] + - - [4, 512, 1, 100] + - [127, 0.302] + - - [16, 500, 1, 100] + - [164, 1.178] + - - [8, 1024, 1, 2048] + - [157, 3.104] + - - [16, 1024, 1, 2048] + - [157, 6.181] + - - [64, 512, 1, 100] + - [132, 4.927] + - - [2, 100, 1, 500] + - [127, 0.063] + - - [2, 500, 1, 512] + - [157, 0.32] + - - [128, 500, 1, 1024] + - [167, 16.265] + - - [10, 100, 1, 10] + - [120, 0.027] + - - [64, 1024, 1, 10] + - [120, 1.805] + - - [500, 100, 1, 500] + - [152, 11.699] + - - [2, 512, 1, 100] + - [120, 0.149] + - - [16, 100, 1, 500] + - [140, 0.501] + - - [128, 100, 1, 500] + - [157, 4.101] + - - [512, 100, 1, 1024] + - [161, 13.025] + - - [16, 100, 1, 2000] + - [127, 0.636] + - - [10, 512, 1, 100] + - [148, 0.749] + - - [8, 512, 1, 100] + - [127, 0.605] + - - [128, 100, 1, 2000] + - [127, 5.179] + - - [2, 1024, 1, 2000] + - [148, 0.767] + - - [100, 512, 1, 512] + - [152, 11.97] + - - [32, 1024, 1, 2000] + - [148, 12.258] + - - [128, 500, 1, 100] + - [124, 8.857] + - - [100, 100, 1, 100] + - [132, 1.493] + - - [8, 512, 1, 1024] + - [157, 1.52] + - - [200, 100, 1, 500] + - [148, 6.267] + - - [2, 1024, 1, 2048] + - [140, 0.772] + - - [512, 100, 1, 2000] + - [161, 13.634] + - - [16, 512, 1, 2000] + - [127, 3.244] + - - [64, 500, 1, 1024] + - [157, 11.234] + - - [10, 512, 1, 1024] + - [140, 1.908] + - - [512, 100, 1, 100] + - [161, 7.129] + - - [8, 100, 1, 1024] + - [140, 0.298] + - - [10, 100, 1, 100] + - [127, 0.144] + - - [10, 500, 1, 2000] + - [171, 1.981] + - - [500, 100, 1, 2000] + - [161, 13.404] + - - [100, 512, 1, 2000] + - [167, 13.454] + - - [64, 1024, 1, 512] + - [134, 15.353] + - - [32, 500, 1, 100] + - [132, 2.423] + - - [10, 100, 1, 2048] + - [140, 0.406] + - - [64, 100, 1, 100] + - [132, 0.97] + - - [2, 1024, 1, 100] + - [120, 0.298] + - - [64, 500, 1, 2000] + - [148, 12.042] + - - [8, 512, 1, 512] + - [140, 1.29] + - - [8, 512, 1, 2048] + - [157, 1.665] + - - [100, 100, 1, 1024] + - [171, 3.763] + - - [8, 100, 1, 2000] + - [140, 0.32] + - - [2, 1024, 1, 1024] + - [171, 0.726] + - - [16, 512, 1, 512] + - [148, 2.594] + - - [32, 500, 1, 512] + - [140, 5.022] + - - [32, 500, 1, 1024] + - [171, 5.96] + - - [32, 500, 1, 10] + - [129, 0.447] + - - [4, 1024, 1, 500] + - [164, 1.241] + - - [256, 100, 1, 512] + - [148, 8.121] + - - [8, 1024, 1, 500] + - [148, 2.477] + - - [4, 1024, 1, 100] + - [127, 0.591] + - - [100, 500, 1, 500] + - [134, 11.464] + - - [2, 1024, 1, 500] + - [127, 0.618] + - - [64, 100, 1, 500] + - [140, 2.035] + - - [2, 512, 1, 500] + - [127, 0.32] + - - [10, 1024, 1, 500] + - [127, 3.104] + - - [128, 500, 1, 512] + - [167, 14.902] + - - [10, 500, 1, 2048] + - [157, 2.035] + - - [128, 512, 1, 512] + - [152, 15.322] + - - [64, 512, 1, 10] + - [129, 0.947] + - - [32, 500, 1, 2000] + - [171, 6.362] + - - [100, 100, 1, 2048] + - [171, 4.088] + - - [200, 100, 1, 512] + - [148, 6.362] + - - [200, 100, 1, 2048] + - [148, 7.796] + - - [8, 100, 1, 10] + - [120, 0.023] + - - [100, 100, 1, 500] + - [157, 3.194] + - - [100, 500, 1, 10] + - [129, 1.313] + - - [10, 500, 1, 1024] + - [140, 1.868] + - - [256, 100, 1, 10] + - [129, 0.713] + - - [10, 512, 1, 2048] + - [157, 2.08] + - - [2, 1024, 1, 512] + - [157, 0.645] + - - [4, 500, 1, 2048] + - [171, 0.817] + - - [100, 512, 1, 100] + - [132, 7.174] + - - [16, 500, 1, 512] + - [140, 2.545] + - - [10, 1024, 1, 100] + - [148, 1.48] + - - [8, 1024, 1, 100] + - [127, 1.191] + - - [64, 1024, 1, 2000] + - [122, 17.537] + - - [10, 100, 1, 512] + - [140, 0.316] + - - [4, 500, 1, 2000] + - [148, 0.794] + - - [4, 100, 1, 100] + - [120, 0.059] + - - [32, 1024, 1, 512] + - [140, 10.341] + - - [8, 512, 1, 2000] + - [127, 1.62] + - - [100, 100, 1, 512] + - [157, 3.199] + - - [2, 512, 1, 2000] + - [127, 0.406] + - - [16, 500, 1, 10] + - [126, 0.221] + - - [10, 500, 1, 100] + - [148, 0.726] + - - [4, 100, 1, 500] + - [127, 0.126] + - - [64, 500, 1, 100] + - [132, 4.751] + - - [2, 100, 1, 100] + - [120, 0.032] + - - [10, 512, 1, 2000] + - [148, 2.03] + - - [8, 500, 1, 500] + - [127, 1.259] + - - [4, 500, 1, 512] + - [157, 0.636] + - - [10, 500, 1, 10] + - [126, 0.135] + - - [64, 512, 1, 2000] + - [148, 12.506] + - - [32, 512, 1, 2000] + - [127, 6.515] + - - [128, 500, 1, 2048] + - [134, 16.707] + - - [4, 512, 1, 512] + - [140, 0.645] + - - [16, 500, 1, 1024] + - [140, 2.982] + - - [10, 1024, 1, 10] + - [124, 0.28] + - - [16, 500, 1, 500] + - [148, 2.499] + - - [500, 100, 1, 1024] + - [134, 12.57] + - - [16, 100, 1, 512] + - [140, 0.51] + - - [64, 512, 1, 2048] + - [157, 12.47] + - - [32, 1024, 1, 10] + - [129, 0.925] + - - [8, 1024, 1, 512] + - [157, 2.549] + - - [4, 1024, 1, 2048] + - [171, 1.557] + - - [128, 500, 1, 500] + - [122, 14.979] + - - [100, 512, 1, 1024] + - [167, 12.953] + - - [16, 1024, 1, 500] + - [148, 4.954] + - - [128, 100, 1, 2048] + - [127, 5.274] + - - [100, 512, 1, 500] + - [152, 11.74] + - - [8, 1024, 1, 1024] + - [157, 2.901] + - - [4, 500, 1, 10] + - [122, 0.054] + - - [128, 500, 1, 10] + - [129, 1.696] + - - [32, 1024, 1, 100] + - [132, 4.895] + - - [8, 500, 1, 2048] + - [171, 1.629] + - - [16, 1024, 1, 1024] + - [171, 5.798] + - - [200, 100, 1, 10] + - [129, 0.555] + - - [512, 100, 1, 500] + - [143, 12.055] + - - [4, 500, 1, 100] + - [126, 0.293] + - - [8, 100, 1, 2048] + - [140, 0.325] + - - [512, 100, 1, 10] + - [129, 1.376] + - - [4, 512, 1, 1024] + - [157, 0.758] + - - [32, 1024, 1, 2048] + - [171, 12.43] + - - [128, 100, 1, 512] + - [127, 4.106] + - - [32, 1024, 1, 500] + - [148, 9.935] + - - [4, 1024, 1, 10] + - [129, 0.117] + - - [100, 512, 1, 10] + - [124, 1.344] + - - [8, 100, 1, 100] + - [127, 0.117] + - - [128, 512, 1, 500] + - [122, 15.43] + - - [16, 100, 1, 2048] + - [140, 0.654] + - - [2, 1024, 1, 10] + - [120, 0.054] + - - [4, 100, 1, 2048] + - [127, 0.162] + - - [4, 512, 1, 2000] + - [157, 0.812] + - - [1024, 29, 1, 1024] + - [127, 10.363] + - - [1024, 1, 1, 21] + - [120, 0.05] + - - [1024, 49, 1, 1024] + - [161, 12.412] + - - [1024, 35, 1, 1024] + - [148, 10.891] + - - [1024, 24, 1, 1024] + - [127, 8.604] + - - [1024, 21, 1, 1024] + - [127, 7.526] + - - [1024, 1, 1, 14] + - [129, 0.041] + - - [1024, 91, 1, 1024] + - [123, 18.367] + - - [1024, 14, 1, 1024] + - [148, 5.225] + - - [1024, 25, 1, 1024] + - [127, 8.947] + - - [1024, 27, 1, 1024] + - [127, 9.664] + - - [1024, 50, 1, 1024] + - [161, 12.637] + - - [1024, 64, 1, 1024] + - [122, 16.138] + - - [1024, 13, 1, 1024] + - [164, 4.859] + - - [1024, 63, 1, 1024] + - [161, 15.872] + - - [1024, 86, 1, 1024] + - [123, 17.388] + - - [1024, 1, 1, 13] + - [126, 0.036] + - - [289, 192, 1, 1344] + - [161, 14.465] + - - [196, 128, 1, 800] + - [148, 8.807] + - - [64, 512, 1, 1344] + - [127, 12.146] + - - [289, 224, 1, 1568] + - [122, 16.567] + - - [64, 256, 1, 1536] + - [157, 6.547] + - - [289, 160, 1, 1120] + - [148, 14.059] + - - [64, 256, 1, 1152] + - [171, 6.343] + - - [289, 224, 1, 1344] + - [161, 16.382] + - - [289, 192, 1, 896] + - [161, 14.0] + - - [784, 16, 32, 192] + - [122, 17.848] + - - [49, 128, 1, 1200] + - [148, 2.36] + - - [289, 128, 1, 896] + - [148, 11.013] + - - [1001, 32, 1, 1024] + - [127, 11.527] + - - [64, 448, 1, 1152] + - [148, 10.526] + - - [1001, 32, 1, 2048] + - [127, 12.394] + - - [289, 192, 1, 1120] + - [161, 14.275] + - - [64, 320, 1, 1728] + - [127, 7.95] + - - [289, 96, 1, 864] + - [148, 9.727] + - - [196, 64, 1, 800] + - [140, 4.476] + - - [784, 32, 1, 400] + - [127, 7.35] + - - [64, 320, 1, 2880] + - [127, 8.27] + - - [1001, 32, 1, 1536] + - [127, 12.146] + - - [64, 384, 1, 1152] + - [127, 9.032] + - - [64, 192, 1, 1728] + - [148, 4.891] + - - [1001, 64, 1, 1536] + - [143, 16.215] + - - [1001, 64, 1, 2048] + - [122, 16.554] + - - [1024, 64, 1, 4096] + - [286, 25.912] + - - [64, 10, 448, 10] + - [122, 4.372] + - - [64, 18, 648, 18] + - [168, 12.123] + - - [64, 18, 1720, 18] + - [144, 16.355] + - - [64, 19, 1632, 19] + - [123, 16.454] + - - [64, 21, 1472, 21] + - [168, 18.196] + - - [64, 23, 64, 23] + - [161, 4.444] + - - [64, 26, 56, 26] + - [143, 5.013] + - - [1024, 1, 1, 2] + - [120, 0.005] + - - [1024, 1, 1, 1024] + - [148, 0.374] + - - [64, 27, 56, 26] + - [161, 5.306] + - - [64, 17, 1, 17] + - [120, 0.045] + - - [64, 30, 1, 30] + - [120, 0.122] + - - [64, 31, 1, 30] + - [124, 0.131] + - - [64, 31, 1, 31] + - [130, 0.135] + - - [64, 14, 1, 14] + - [171, 0.041] + - - [64, 14, 1, 15] + - [126, 0.036] + - - [64, 15, 1, 15] + - [126, 0.041] + - - [64, 15, 1, 17] + - [120, 0.041] + - - [100, 512, 1, 2048] + - [134, 13.486] + - - [1024, 1, 1, 1600] + - [127, 0.393] + - - [1024, 1, 1, 200] + - [127, 0.221] + - - [1, 200, 1, 1] + - [120, 0.001] + - - [1, 512, 1, 1] + - [161, 0.002] + - - [67, 512, 1, 2048] + - [157, 10.932] + - - [74, 512, 1, 2048] + - [171, 12.073] + - - [64, 3, 512, 3] + - [168, 0.578] + - - [64, 5, 512, 5] + - [120, 1.457] + - - [64, 9, 512, 9] + - [122, 4.047] + - - [64, 512, 1, 512] + - [157, 10.427] + - - [25, 128, 120, 256] + - [156, 16.761] + - - [25, 128, 139, 256] + - [133, 17.988] + - - [25, 128, 160, 256] + - [139, 17.415] + - - [25, 128, 18, 256] + - [152, 11.433] + - - [25, 128, 19, 256] + - [134, 10.481] + - - [9, 128, 120, 256] + - [170, 7.783] + - - [9, 128, 139, 256] + - [138, 7.868] + - - [9, 128, 160, 256] + - [138, 8.126] + - - [9, 128, 18, 256] + - [132, 4.606] + - - [9, 128, 19, 256] + - [132, 4.841] + - - [1, 256, 1, 1152] + - [140, 0.099] + - - [100, 512, 1, 2304] + - [134, 13.576] + - - [25, 256, 1, 1152] + - [140, 2.427] + - - [9, 256, 1, 1152] + - [140, 0.875] + - - [1024, 77, 1, 1024] + - [122, 16.247] + - - [1024, 10, 1, 2] + - [161, 0.063] + - - [1024, 10, 1, 1024] + - [164, 3.763] + - - [1024, 39, 1, 2] + - [143, 0.239] + - - [1024, 39, 1, 1024] + - [148, 12.137] + - - [1024, 40, 1, 2] + - [161, 0.248] + - - [1024, 40, 1, 1024] + - [148, 12.466] + - - [1024, 41, 1, 2] + - [161, 0.253] + - - [1024, 41, 1, 1024] + - [148, 12.71] + - - [1024, 5, 1, 2] + - [120, 0.032] + - - [1024, 5, 1, 1024] + - [148, 1.886] + - - [1024, 8, 1, 2] + - [120, 0.05] + - - [1024, 9, 1, 2] + - [161, 0.059] + - - [1024, 9, 1, 1024] + - [148, 3.388] + - - [64, 4, 32768, 4] + - [123, 2.761] + - - [64, 4, 38400, 4] + - [123, 2.775] + - - [64, 14, 10880, 14] + - [161, 16.68] + - - [64, 14, 10880, 15] + - [161, 16.996] + - - [64, 15, 7680, 15] + - [161, 17.781] + - - [64, 15, 10880, 15] + - [161, 18.115] + - - [64, 15, 7680, 17] + - [161, 17.474] + - - [64, 17, 6144, 17] + - [123, 17.875] + - - [64, 17, 7680, 17] + - [144, 18.205] + - - [64, 17, 6144, 21] + - [162, 19.391] + - - [64, 21, 6144, 21] + - [162, 23.637] + - - [64, 24, 4736, 24] + - [153, 26.308] + - - [64, 24, 4736, 34] + - [135, 27.034] + - - [64, 30, 2048, 30] + - [135, 28.582] + - - [64, 31, 2048, 30] + - [135, 28.645] + - - [64, 31, 2048, 31] + - [162, 28.613] + - - [128, 128, 1, 64] + - [132, 2.075] + - - [64, 5, 1, 5] + - [120, 0.005] + - - [32, 33, 1, 33] + - [124, 0.081] + - - [64, 5, 960, 5] + - [120, 2.026] + - - [74, 960, 1, 2048] + - [168, 15.132] + - - [128, 27, 32768, 27] + - [142, 10.594] + - - [1024, 16, 1, 1024] + - [171, 5.725] + - - [1024, 16, 1, 2] + - [129, 0.099] + - - [1024, 64, 1, 2] + - [143, 0.383] + - - [1024, 80, 1, 2] + - [143, 0.474] + - - [1024, 82, 1, 1024] + - [123, 16.382] + - - [1024, 82, 1, 2] + - [143, 0.492] + - - [1024, 12, 1, 1024] + - [148, 4.476] + - - [1024, 12, 1, 2] + - [120, 0.072] + - - [64, 24, 6816, 24] + - [168, 26.953] + - - [64, 26, 6272, 26] + - [135, 28.658] + - - [196, 256, 1, 2304] + - [126, 13.725] + - - [850, 3, 2, 256] + - [148, 1.281] + - - [850, 12, 2, 256] + - [148, 5.035] + - - [805, 12, 2, 256] + - [148, 4.769] + - - [805, 3, 2, 256] + - [148, 1.2] + - - [768, 3, 2, 256] + - [148, 1.132] + - - [768, 12, 2, 256] + - [148, 4.548] + - - [864, 12, 2, 256] + - [148, 5.116] + - - [864, 3, 2, 256] + - [148, 1.29] + - - [247, 3, 2, 256] + - [148, 0.37] + - - [216, 3, 2, 256] + - [126, 0.32] + - - [950, 3, 2, 256] + - [148, 1.412] + - - [187, 12, 2, 256] + - [148, 1.114] + - - [176, 12, 2, 256] + - [148, 1.056] + - - [247, 12, 2, 256] + - [127, 1.462] + - - [187, 3, 2, 256] + - [148, 0.28] + - - [228, 12, 2, 256] + - [148, 1.349] + - - [221, 12, 2, 256] + - [127, 1.308] + - - [176, 3, 2, 256] + - [148, 0.266] + - - [950, 12, 2, 256] + - [148, 5.626] + - - [192, 12, 2, 256] + - [148, 1.146] + - - [228, 3, 2, 256] + - [127, 0.338] + - - [221, 3, 2, 256] + - [126, 0.329] + - - [192, 3, 2, 256] + - [148, 0.289] + - - [216, 12, 2, 256] + - [127, 1.286] + - - [2, 6, 1, 1024] + - [120, 0.005] + - - [1024, 20, 1, 2] + - [120, 0.126] + - - [64, 128, 768, 128] + - [217, 54.456] + - - [64, 128, 3072, 128] + - [219, 30.384] + - - [64, 256, 12, 256] + - [220, 13.326] + - - [64, 256, 16, 256] + - [221, 19.384] + - - [64, 256, 24, 256] + - [221, 25.63] + - - [64, 256, 32, 256] + - [222, 29.226] + - - [64, 256, 48, 256] + - [217, 37.879] + - - [64, 256, 64, 256] + - [223, 45.09] + - - [64, 256, 128, 256] + - [217, 52.113] + - - [64, 256, 256, 256] + - [224, 58.48] + - - [64, 256, 384, 256] + - [225, 50.421] + - - [64, 256, 512, 256] + - [226, 36.215] + - - [64, 256, 768, 256] + - [227, 36.393] + - - [64, 256, 1024, 256] + - [228, 36.692] + - - [64, 256, 1536, 256] + - [226, 36.911] + - - [64, 256, 2048, 256] + - [227, 37.053] + - - [64, 256, 3072, 256] + - [226, 37.126] + - - [64, 384, 12, 384] + - [229, 23.267] + - - [64, 384, 16, 384] + - [221, 30.197] + - - [64, 384, 24, 384] + - [224, 38.187] + - - [64, 384, 32, 384] + - [224, 40.05] + - - [64, 384, 48, 384] + - [224, 57.798] + - - [64, 384, 64, 384] + - [224, 57.684] + - - [64, 384, 96, 384] + - [217, 56.332] + - - [64, 384, 128, 384] + - [230, 59.628] + - - [64, 384, 256, 384] + - [231, 40.186] + - - [64, 384, 384, 384] + - [232, 40.452] + - - [64, 384, 512, 384] + - [233, 41.027] + - - [64, 384, 768, 384] + - [234, 41.181] + - - [64, 384, 1024, 384] + - [235, 41.494] + - - [64, 384, 1536, 384] + - [233, 41.665] + - - [64, 384, 2048, 384] + - [234, 41.662] + - - [64, 384, 3072, 384] + - [233, 41.743] + - - [64, 512, 12, 512] + - [221, 31.649] + - - [64, 512, 24, 512] + - [217, 40.532] + - - [64, 512, 32, 512] + - [230, 55.1] + - - [64, 512, 384, 512] + - [233, 44.726] + - - [64, 512, 512, 512] + - [234, 44.823] + - - [64, 512, 768, 512] + - [234, 45.211] + - - [64, 512, 1024, 512] + - [232, 45.193] + - - [64, 512, 1536, 512] + - [232, 45.441] + - - [64, 512, 2048, 512] + - [236, 45.372] + - - [64, 512, 3072, 512] + - [233, 45.41] + - - [128, 128, 12, 64] + - [237, 5.501] + - - [128, 128, 24, 64] + - [238, 10.631] + - - [128, 128, 48, 64] + - [239, 17.994] + - - [128, 128, 96, 64] + - [240, 35.874] + - - [128, 128, 384, 64] + - [241, 58.527] + - - [256, 256, 12, 64] + - [242, 23.459] + - - [256, 256, 24, 64] + - [242, 30.896] + - - [256, 256, 48, 64] + - [243, 50.407] + - - [256, 256, 96, 64] + - [242, 65.207] + - - [256, 256, 192, 64] + - [244, 74.059] + - - [256, 256, 384, 64] + - [245, 71.325] + - - [384, 384, 12, 64] + - [246, 38.59] + - - [384, 384, 24, 64] + - [247, 54.124] + - - [384, 384, 48, 64] + - [240, 67.901] + - - [384, 384, 96, 64] + - [240, 75.75] + - - [384, 384, 384, 64] + - [248, 55.544] + - - [512, 512, 12, 64] + - [245, 48.316] + - - [512, 512, 24, 64] + - [242, 65.02] + - - [512, 512, 96, 64] + - [249, 79.365] + - - [512, 512, 384, 64] + - [250, 53.949] + - - [768, 1536, 1, 1] + - [251, 0.657] + - - [768, 2048, 1, 1] + - [252, 0.917] + - - [768, 3072, 1, 1] + - [252, 1.167] + - - [768, 4096, 1, 1] + - [253, 1.354] + - - [768, 6144, 1, 1] + - [254, 1.62] + - - [768, 8192, 1, 1] + - [255, 1.799] + - - [768, 12288, 1, 1] + - [254, 2.029] + - - [768, 16384, 1, 1] + - [256, 2.352] + - - [768, 128, 1, 768] + - [257, 18.196] + - - [768, 256, 1, 768] + - [258, 27.185] + - - [768, 384, 1, 768] + - [259, 38.973] + - - [768, 1536, 1, 768] + - [247, 73.781] + - - [768, 3072, 1, 768] + - [260, 82.045] + - - [768, 6144, 1, 768] + - [261, 87.874] + - - [768, 8192, 1, 768] + - [260, 86.776] + - - [768, 12288, 1, 768] + - [262, 90.788] + - - [768, 16384, 1, 768] + - [263, 89.066] + - - [768, 24576, 1, 768] + - [264, 92.011] + - - [768, 32768, 1, 768] + - [264, 91.518] + - - [768, 49152, 1, 768] + - [265, 92.743] + - - [768, 65536, 1, 768] + - [265, 92.666] + - - [768, 98304, 1, 768] + - [265, 93.635] + - - [768, 131072, 1, 768] + - [265, 93.693] + - - [768, 16, 1, 3072] + - [266, 5.286] + - - [768, 32, 1, 3072] + - [267, 10.651] + - - [768, 64, 1, 3072] + - [268, 18.235] + - - [768, 128, 1, 3072] + - [269, 28.733] + - - [768, 256, 1, 3072] + - [270, 41.975] + - - [768, 384, 1, 3072] + - [271, 52.283] + - - [768, 768, 1, 3072] + - [273, 68.825] + - - [768, 1536, 1, 3072] + - [275, 80.597] + - - [768, 3072, 1, 3072] + - [277, 87.456] + - - [768, 6144, 1, 3072] + - [278, 89.889] + - - [768, 8192, 1, 3072] + - [265, 88.271] + - - [768, 12288, 1, 3072] + - [265, 91.531] + - - [768, 16384, 1, 3072] + - [265, 89.963] + - - [768, 24576, 1, 3072] + - [265, 92.907] + - - [768, 32768, 1, 3072] + - [265, 92.702] + - - [768, 49152, 1, 3072] + - [265, 93.497] + - - [768, 65536, 1, 3072] + - [265, 93.308] + - - [768, 98304, 1, 3072] + - [279, 92.509] + - - [768, 131072, 1, 3072] + - [280, 93.165] + - - [1024, 384, 1, 1024] + - [281, 41.706] + - - [1024, 768, 1, 1024] + - [282, 61.981] + - - [1024, 24576, 1, 1024] + - [265, 91.91] + - - [1024, 49152, 1, 1024] + - [283, 92.925] + - - [1024, 65536, 1, 1024] + - [283, 93.434] + - - [1024, 16, 1, 4096] + - [284, 8.188] + - - [1024, 32, 1, 4096] + - [285, 15.68] + - - [1024, 128, 1, 4096] + - [287, 38.509] + - - [1024, 384, 1, 4096] + - [289, 56.339] + - - [1024, 768, 1, 4096] + - [272, 72.464] + - - [1024, 1536, 1, 4096] + - [290, 79.945] + - - [1024, 24576, 1, 4096] + - [265, 93.005] + - - [1024, 49152, 1, 4096] + - [283, 92.17] + - - [1024, 65536, 1, 4096] + - [283, 93.146] + - - [2304, 16, 1, 1] + - [292, 0.02] + - - [2304, 32, 1, 1] + - [293, 0.051] + - - [2304, 64, 1, 1] + - [294, 0.099] + - - [2304, 128, 1, 1] + - [295, 0.179] + - - [2304, 256, 1, 1] + - [296, 0.324] + - - [2304, 384, 1, 1] + - [297, 0.436] + - - [2304, 512, 1, 1] + - [298, 0.533] + - - [2304, 768, 1, 1] + - [299, 0.944] + - - [2304, 1024, 1, 1] + - [255, 1.167] + - - [2304, 1536, 1, 1] + - [300, 1.431] + - - [2304, 2048, 1, 1] + - [255, 1.645] + - - [2304, 3072, 1, 1] + - [253, 1.867] + - - [2304, 4096, 1, 1] + - [255, 2.031] + - - [2304, 6144, 1, 1] + - [301, 2.239] + - - [2304, 8192, 1, 1] + - [302, 2.351] + - - [2304, 12288, 1, 1] + - [257, 2.463] + - - [2304, 16384, 1, 1] + - [303, 2.133] + - - [2304, 24576, 1, 1] + - [303, 1.848] + - - [2304, 32768, 1, 1] + - [304, 1.616] + - - [2304, 49152, 1, 1] + - [304, 1.305] + - - [2304, 65536, 1, 1] + - [304, 1.189] + - - [2304, 98304, 1, 1] + - [305, 1.079] + - - [2304, 131072, 1, 1] + - [306, 1.009] + - - [2304, 16, 1, 768] + - [307, 6.758] + - - [2304, 32, 1, 768] + - [308, 13.764] + - - [2304, 64, 1, 768] + - [309, 25.269] + - - [2304, 128, 1, 768] + - [310, 37.404] + - - [2304, 256, 1, 768] + - [311, 56.426] + - - [2304, 384, 1, 768] + - [242, 54.646] + - - [2304, 512, 1, 768] + - [312, 74.264] + - - [2304, 768, 1, 768] + - [313, 78.224] + - - [2304, 1024, 1, 768] + - [314, 82.26] + - - [2304, 1536, 1, 768] + - [261, 85.668] + - - [2304, 2048, 1, 768] + - [261, 87.686] + - - [2304, 3072, 1, 768] + - [314, 89.776] + - - [2304, 4096, 1, 768] + - [263, 90.783] + - - [2304, 6144, 1, 768] + - [263, 91.867] + - - [2304, 8192, 1, 768] + - [262, 92.381] + - - [2304, 12288, 1, 768] + - [264, 92.918] + - - [2304, 16384, 1, 768] + - [264, 93.227] + - - [2304, 24576, 1, 768] + - [264, 93.687] + - - [2304, 32768, 1, 768] + - [264, 93.824] + - - [2304, 49152, 1, 768] + - [264, 93.971] + - - [2304, 65536, 1, 768] + - [264, 93.953] + - - [2304, 98304, 1, 768] + - [264, 94.169] + - - [2304, 131072, 1, 768] + - [265, 93.104] + - - [3072, 16, 1, 1] + - [292, 0.026] + - - [3072, 32, 1, 1] + - [315, 0.069] + - - [3072, 64, 1, 1] + - [316, 0.129] + - - [3072, 128, 1, 1] + - [317, 0.226] + - - [3072, 256, 1, 1] + - [318, 0.441] + - - [3072, 384, 1, 1] + - [296, 0.595] + - - [3072, 512, 1, 1] + - [319, 0.876] + - - [3072, 768, 1, 1] + - [300, 1.183] + - - [3072, 1024, 1, 1] + - [252, 1.354] + - - [3072, 1536, 1, 1] + - [300, 1.633] + - - [3072, 2048, 1, 1] + - [320, 1.79] + - - [3072, 3072, 1, 1] + - [321, 2.051] + - - [3072, 4096, 1, 1] + - [321, 2.221] + - - [3072, 6144, 1, 1] + - [257, 2.408] + - - [3072, 8192, 1, 1] + - [322, 2.494] + - - [3072, 12288, 1, 1] + - [323, 2.095] + - - [3072, 16384, 1, 1] + - [304, 1.87] + - - [3072, 24576, 1, 1] + - [304, 1.595] + - - [3072, 32768, 1, 1] + - [304, 1.375] + - - [3072, 49152, 1, 1] + - [304, 1.188] + - - [3072, 65536, 1, 1] + - [304, 1.114] + - - [3072, 16, 1, 768] + - [324, 8.55] + - - [3072, 32, 1, 768] + - [325, 17.486] + - - [3072, 64, 1, 768] + - [326, 26.842] + - - [3072, 128, 1, 768] + - [259, 39.561] + - - [3072, 256, 1, 768] + - [240, 61.1] + - - [3072, 384, 1, 768] + - [275, 73.384] + - - [3072, 768, 1, 768] + - [260, 82.293] + - - [3072, 1536, 1, 768] + - [260, 87.658] + - - [3072, 3072, 1, 768] + - [261, 90.742] + - - [3072, 6144, 1, 768] + - [260, 92.302] + - - [3072, 8192, 1, 768] + - [263, 92.218] + - - [3072, 12288, 1, 768] + - [263, 93.224] + - - [3072, 24576, 1, 768] + - [263, 93.892] + - - [3072, 32768, 1, 768] + - [264, 93.943] + - - [3072, 49152, 1, 768] + - [263, 94.04] + - - [3072, 65536, 1, 768] + - [263, 94.186] + - - [3072, 98304, 1, 768] + - [263, 93.271] + - - [3072, 131072, 1, 768] + - [263, 93.607] + - - [3072, 256, 1, 1024] + - [328, 62.752] + - - [3072, 384, 1, 1024] + - [275, 75.297] + - - [3072, 768, 1, 1024] + - [263, 82.826] + - - [3072, 1024, 1, 1024] + - [329, 80.294] + - - [3072, 1536, 1, 1024] + - [330, 88.201] + - - [3072, 6144, 1, 1024] + - [263, 92.469] + - - [3072, 8192, 1, 1024] + - [262, 92.107] + - - [3072, 12288, 1, 1024] + - [262, 93.035] + - - [3072, 16384, 1, 1024] + - [265, 92.859] + - - [3072, 24576, 1, 1024] + - [283, 93.52] + - - [3072, 32768, 1, 1024] + - [283, 93.813] + - - [3072, 49152, 1, 1024] + - [280, 94.008] + - - [3072, 65536, 1, 1024] + - [283, 92.944] + - - [4096, 16, 1, 1024] + - [331, 11.504] + - - [4096, 32, 1, 1024] + - [227, 24.696] + - - [4096, 64, 1, 1024] + - [259, 37.871] + - - [4096, 128, 1, 1024] + - [282, 52.316] + - - [4096, 384, 1, 1024] + - [278, 71.367] + - - [4096, 768, 1, 1024] + - [249, 80.526] + - - [4096, 1536, 1, 1024] + - [278, 87.272] + - - [4096, 24576, 1, 1024] + - [280, 93.694] + - - [4096, 49152, 1, 1024] + - [280, 92.909] + - - [4096, 65536, 1, 1024] + - [280, 93.326] + - - [1280, 4096, 1, 1280] + - [261, 88.211] + - - [1280, 4096, 1, 5120] + - [283, 89.725] + - - [3840, 4096, 1, 1280] + - [263, 91.523] + - - [5120, 4096, 1, 1280] + - [263, 92.157] + - - [16, 16, 12, 64] + - [332, 0.111] + - - [16, 16, 24, 64] + - [333, 0.209] + - - [16, 16, 48, 64] + - [333, 0.412] + - - [16, 16, 96, 64] + - [332, 0.81] + - - [16, 16, 192, 64] + - [334, 1.56] + - - [16, 16, 384, 64] + - [335, 2.648] + - - [32, 32, 12, 64] + - [334, 0.411] + - - [32, 32, 24, 64] + - [334, 0.889] + - - [32, 32, 48, 64] + - [334, 1.787] + - - [32, 32, 96, 64] + - [336, 3.363] + - - [32, 32, 192, 64] + - [336, 5.976] + - - [32, 32, 384, 64] + - [337, 8.473] + - - [64, 16, 12, 16] + - [336, 0.116] + - - [64, 16, 16, 16] + - [336, 0.159] + - - [64, 16, 24, 16] + - [338, 0.233] + - - [64, 16, 32, 16] + - [338, 0.304] + - - [64, 16, 48, 16] + - [339, 0.453] + - - [64, 16, 64, 16] + - [338, 0.586] + - - [64, 16, 96, 16] + - [340, 0.853] + - - [64, 16, 128, 16] + - [340, 1.09] + - - [64, 16, 192, 16] + - [340, 1.507] + - - [64, 16, 384, 16] + - [336, 2.414] + - - [64, 16, 512, 16] + - [334, 2.911] + - - [64, 16, 768, 16] + - [341, 3.836] + - - [64, 16, 1024, 16] + - [342, 4.453] + - - [64, 16, 1536, 16] + - [343, 5.621] + - - [64, 16, 2048, 16] + - [341, 6.318] + - - [64, 16, 3072, 16] + - [341, 8.288] + - - [64, 32, 12, 32] + - [344, 0.516] + - - [64, 32, 16, 32] + - [336, 0.639] + - - [64, 32, 24, 32] + - [344, 0.964] + - - [64, 32, 32, 32] + - [344, 1.268] + - - [64, 32, 48, 32] + - [332, 1.863] + - - [64, 32, 64, 32] + - [345, 2.408] + - - [64, 32, 96, 32] + - [346, 3.42] + - - [64, 32, 128, 32] + - [336, 4.291] + - - [64, 32, 192, 32] + - [345, 5.711] + - - [64, 32, 256, 32] + - [347, 6.771] + - - [64, 32, 384, 32] + - [336, 8.857] + - - [64, 32, 512, 32] + - [348, 10.039] + - - [64, 32, 768, 32] + - [341, 11.766] + - - [64, 32, 1024, 32] + - [337, 13.199] + - - [64, 32, 1536, 32] + - [309, 17.659] + - - [64, 32, 2048, 32] + - [342, 26.397] + - - [64, 32, 3072, 32] + - [337, 29.359] + - - [64, 64, 12, 64] + - [349, 2.225] + - - [64, 64, 16, 64] + - [344, 2.342] + - - [64, 64, 24, 64] + - [345, 3.462] + - - [64, 64, 32, 64] + - [348, 4.474] + - - [64, 64, 48, 64] + - [332, 6.252] + - - [64, 64, 64, 64] + - [336, 8.018] + - - [64, 64, 128, 64] + - [350, 12.034] + - - [64, 64, 192, 64] + - [346, 15.618] + - - [64, 64, 256, 64] + - [299, 17.916] + - - [64, 64, 384, 64] + - [351, 20.814] + - - [64, 64, 512, 64] + - [351, 22.395] + - - [64, 64, 1024, 64] + - [351, 31.295] + - - [64, 64, 1536, 64] + - [352, 45.439] + - - [64, 64, 2048, 64] + - [353, 47.947] + - - [64, 64, 3072, 64] + - [344, 33.623] + - - [64, 128, 12, 128] + - [335, 6.94] + - - [64, 128, 16, 128] + - [348, 8.157] + - - [64, 128, 24, 128] + - [345, 10.917] + - - [64, 128, 32, 128] + - [351, 12.808] + - - [64, 128, 48, 128] + - [354, 15.902] + - - [64, 128, 64, 128] + - [355, 19.043] + - - [64, 64, 1280, 64] + - [353, 43.881] + - - [768, 1, 1, 1] + - [356, 0.001] + - - [768, 2, 1, 1] + - [357, 0.001] + - - [768, 4, 1, 1] + - [358, 0.002] + - - [768, 8, 1, 1] + - [357, 0.004] + - - [768, 16, 1, 1] + - [359, 0.009] + - - [768, 32, 1, 1] + - [360, 0.018] + - - [768, 64, 1, 1] + - [356, 0.035] + - - [768, 128, 1, 1] + - [359, 0.068] + - - [768, 256, 1, 1] + - [359, 0.126] + - - [768, 384, 1, 1] + - [361, 0.175] + - - [768, 512, 1, 1] + - [359, 0.219] + - - [768, 768, 1, 1] + - [362, 0.313] + - - [768, 1024, 1, 1] + - [363, 0.383] + - - [1024, 1, 1, 1] + - [357, 0.001] + - - [1024, 2, 1, 1] + - [357, 0.001] + - - [1024, 4, 1, 1] + - [364, 0.003] + - - [1024, 8, 1, 1] + - [365, 0.006] + - - [1024, 16, 1, 1] + - [359, 0.012] + - - [1024, 32, 1, 1] + - [359, 0.023] + - - [1024, 64, 1, 1] + - [359, 0.046] + - - [1024, 128, 1, 1] + - [359, 0.093] - null -- DeviceEfficiency diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB_GB.yaml new file mode 100644 index 000000000..b11be00a0 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Ailk_Bljk_SB_GB.yaml @@ -0,0 +1,58188 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi31 +- gfx1100 +- [Device 744c] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 17894.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20129.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 17109.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19141.0] + - - [768, 4096, 1, 2, 768, 768, 768, 2] + - [29, 894.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [35, 18045.0] + - - [3072, 4096, 1, 768, 3072, 3072, 3072, 768] + - [7, 19786.0] + - - [768, 2048, 1, 2, 768, 768, 768, 2] + - [6, 753.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [16, 16363.0] + - - [3072, 2048, 1, 768, 3072, 3072, 3072, 768] + - [1, 19360.0] + - - [3072, 1024, 1, 768, 3072, 3072, 3072, 768] + - [29, 18143.0] + - - [3072, 512, 1, 768, 3072, 3072, 3072, 768] + - [31, 16377.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18097.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 19398.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 20213.0] + - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] + - [16, 16452.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 19746.0] + - - [1024, 2048, 1, 2, 1024, 1024, 1024, 2] + - [6, 900.0] + - - [1024, 3072, 1, 2, 1024, 1024, 1024, 2] + - [6, 999.0] + - - [1024, 4096, 1, 2, 1024, 1024, 1024, 2] + - [6, 1036.0] + - - [128, 128, 512, 64, 128, 128, 128, 64] + - [12, 15516.0] + - - [512, 512, 64, 64, 512, 512, 512, 64] + - [6, 18380.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 19908.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [1, 18917.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [29, 18628.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [23, 19979.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [1, 18426.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 19078.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [22, 18059.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 128] + - [6, 17672.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [30, 20051.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 19097.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 20649.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 128] + - [30, 18359.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 18179.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [39, 20285.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 19158.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [29, 18282.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [29, 16336.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [29, 18219.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 19978.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [10, 13892.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 128] + - [14, 15400.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [31, 18441.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [30, 18252.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [6, 15957.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 128] + - [0, 15204.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [36, 19928.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [23, 16929.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [0, 17409.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [35, 14706.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [29, 18537.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [14, 18240.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [29, 17436.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20503.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [15, 19377.0] + - - [704, 5056, 1, 128, 704, 704, 704, 128] + - [29, 15167.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 18482.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [15, 20591.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [36, 18640.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 19905.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [35, 17826.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [14, 17117.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 20050.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [36, 19595.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [30, 20653.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [14, 17635.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [15, 19796.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [1, 19172.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [14, 17872.0] + - - [448, 5888, 1, 128, 448, 448, 448, 128] + - [29, 14033.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 18945.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [22, 14360.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [1, 20251.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [29, 15653.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [0, 18224.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 128] + - [14, 17915.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [0, 17660.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [29, 17531.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [14, 18391.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [23, 19469.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [23, 16315.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 128] + - [7, 19512.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 20613.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [29, 17775.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [35, 14099.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [22, 14953.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [36, 18439.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [29, 18470.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [0, 15090.0] + - - [704, 2944, 1, 128, 704, 704, 704, 128] + - [6, 13890.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [29, 17533.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [7, 16666.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [14, 17755.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 128] + - [6, 17762.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [23, 19995.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [6, 16831.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [30, 19959.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [30, 18970.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 19488.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 20397.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [7, 20006.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [16, 16224.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [15, 17310.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [30, 20395.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [7, 18854.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 20838.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 19956.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [14, 18092.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 128] + - [36, 19244.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [36, 17488.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 128] + - [14, 17809.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 128] + - [0, 17659.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [36, 20532.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [30, 19270.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [2, 17347.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 128] + - [29, 18218.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [0, 16918.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [5, 17679.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [36, 19579.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [36, 18694.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [22, 17845.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [27, 19955.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [23, 18930.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [29, 17120.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [14, 17744.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [23, 19532.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 18788.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [7, 19492.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 128] + - [29, 15719.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [36, 20365.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [23, 18110.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 20664.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 19360.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [22, 15407.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [29, 17521.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 20159.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 128] + - [1, 18985.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 128] + - [35, 17354.0] + - - [448, 3584, 1, 128, 448, 448, 448, 128] + - [14, 12441.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 20170.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 128] + - [7, 18274.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [7, 19971.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 128] + - [14, 14879.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 128] + - [14, 14321.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 128] + - [29, 18462.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 128] + - [0, 16592.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 18976.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 19963.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20627.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [29, 16205.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 20012.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [15, 19363.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 20133.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [23, 18994.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [0, 18520.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [36, 18768.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [8, 18329.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [24, 16098.0] + - - [256, 5056, 1, 128, 256, 256, 256, 128] + - [29, 13602.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [7, 18542.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [36, 19192.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 128] + - [14, 17335.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [36, 19053.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [15, 19402.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 20509.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [29, 15901.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [22, 17852.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [15, 18691.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 128] + - [29, 18830.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [0, 17007.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 20105.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [22, 16963.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [4, 20360.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [23, 18784.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 128] + - [35, 18425.0] + - - [448, 6784, 1, 128, 448, 448, 448, 128] + - [6, 14592.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 128] + - [1, 18726.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 20733.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [30, 18947.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [29, 18567.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [30, 20237.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [29, 15446.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [23, 20302.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [29, 16582.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [22, 14610.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [23, 19523.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 128] + - [29, 15530.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [15, 19434.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 128] + - [1, 19683.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 128] + - [29, 14738.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [23, 19212.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [30, 19121.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 18835.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 128] + - [30, 18857.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 19632.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 128] + - [6, 16175.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [23, 19242.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [1, 20172.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 128] + - [35, 19037.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 128] + - [35, 17972.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [29, 17730.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 18944.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 128] + - [1, 19678.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [7, 19623.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 20919.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [35, 17775.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [35, 15558.0] + - - [448, 4288, 1, 128, 448, 448, 448, 128] + - [29, 13234.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [35, 16714.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 128] + - [29, 17914.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [21, 17340.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [36, 19627.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [30, 19330.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [0, 18861.0] + - - [704, 6784, 1, 128, 704, 704, 704, 128] + - [29, 16096.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [15, 19988.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [2, 15808.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 128] + - [29, 18218.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [36, 20032.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [1, 19834.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 128] + - [6, 17159.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 128] + - [7, 19420.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [36, 20003.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [15, 18408.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [29, 17890.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [29, 15931.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [7, 19491.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 20271.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 20821.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [23, 20826.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 128] + - [36, 19509.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [23, 19018.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [30, 19082.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [23, 19058.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [7, 16223.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [7, 19133.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [7, 20031.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 20104.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 20057.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [30, 16872.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 128] + - [14, 17417.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 128] + - [1, 18694.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [15, 18578.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [17, 16708.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [29, 17635.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [36, 18893.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 128] + - [36, 19527.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [29, 16786.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [6, 15086.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 20269.0] + - - [256, 5888, 1, 128, 256, 256, 256, 128] + - [29, 12984.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 19934.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [14, 17968.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [29, 18218.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 128] + - [0, 19019.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [29, 17937.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 128] + - [29, 16084.0] + - - [448, 5056, 1, 128, 448, 448, 448, 128] + - [14, 13251.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [1, 19335.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [7, 19052.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 128] + - [29, 17951.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 128] + - [1, 18963.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [29, 18756.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 20162.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 20857.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 20369.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [35, 17176.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [29, 18767.0] + - - [3025, 256, 64, 64, 3025, 3025, 3025, 64] + - [0, 12124.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [1, 19999.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 20042.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [35, 15773.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [15, 20252.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [15, 20709.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 20092.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [23, 20208.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 128] + - [30, 19203.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20652.0] + - - [704, 3584, 1, 128, 704, 704, 704, 128] + - [14, 14066.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [29, 16937.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [36, 19459.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 128] + - [1, 18458.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 20054.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [15, 19477.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [29, 17926.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 128] + - [1, 18946.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [14, 16989.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 19206.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [23, 16333.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 128] + - [0, 14955.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [1, 18514.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [0, 17382.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 128] + - [30, 19101.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [29, 18141.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [36, 19843.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 18888.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [36, 17982.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [23, 19185.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 128] + - [35, 16635.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 128] + - [6, 17340.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 20269.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 128] + - [29, 16503.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 128] + - [29, 18668.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 20343.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [6, 18120.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [1, 18444.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 19615.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [7, 18659.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 128] + - [0, 18821.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [30, 18766.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 19954.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [16, 15364.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 128] + - [0, 16783.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20098.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 128] + - [29, 18237.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [14, 18547.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 128] + - [29, 14680.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [0, 18094.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [15, 18302.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20187.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [29, 18165.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 20545.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 19862.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [30, 18165.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 19920.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 20642.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 20243.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [11, 18755.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [14, 17092.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [23, 20014.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [30, 20007.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 128] + - [29, 16759.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 20504.0] + - - [704, 2368, 1, 128, 704, 704, 704, 128] + - [14, 12686.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 19080.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [30, 19193.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 19181.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [30, 19545.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [7, 18072.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [29, 17594.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [35, 17685.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [1, 20103.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 18368.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [21, 17047.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [14, 17844.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [7, 20045.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [36, 18750.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [36, 20480.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [35, 19128.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [15, 19280.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 16101.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 128] + - [29, 18763.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [1, 18210.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [23, 16736.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [29, 17731.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [36, 19133.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 20091.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 17778.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [14, 16969.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 19617.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [35, 17303.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [1, 19890.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 128] + - [6, 15658.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [7, 19718.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [1, 20068.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 128] + - [7, 18514.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [36, 18939.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 19046.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 128] + - [6, 14706.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [29, 15686.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [15, 17333.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 18903.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [0, 16672.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [29, 18940.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [21, 17015.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [29, 17052.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [29, 18929.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [23, 20431.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [37, 16228.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [29, 18189.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [7, 18976.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [29, 17292.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [29, 16805.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 128] + - [29, 17041.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 128] + - [14, 18397.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [29, 18257.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [1, 19414.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 128] + - [14, 18540.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [7, 19812.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 128] + - [29, 14293.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [7, 20252.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [29, 15237.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 19494.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [29, 19196.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [15, 19440.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 15924.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 18695.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [14, 15791.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [0, 17077.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [36, 15277.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [5, 17188.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 128] + - [23, 18236.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [14, 17619.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 128] + - [0, 18764.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [15, 18976.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 19555.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 128] + - [7, 19046.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 128] + - [35, 16790.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 20548.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 128] + - [14, 17576.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 128] + - [0, 16869.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 128] + - [29, 16526.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [35, 15709.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 19972.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 128] + - [14, 16526.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [6, 16713.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [29, 17430.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 128] + - [0, 17020.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [23, 16044.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [23, 18918.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [7, 19608.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [35, 14583.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [36, 19850.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 128] + - [29, 17914.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 20340.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [23, 19675.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [0, 17861.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [36, 18661.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 128] + - [29, 17824.0] + - - [256, 6784, 1, 128, 256, 256, 256, 128] + - [0, 15480.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 128] + - [15, 19160.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 128] + - [23, 18622.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 128] + - [0, 17760.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [29, 18441.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [15, 19874.0] + - - [704, 5888, 1, 128, 704, 704, 704, 128] + - [14, 15596.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 128] + - [36, 19468.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [23, 18895.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 20052.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 128] + - [29, 15268.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [14, 18466.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [29, 17609.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [22, 13415.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [14, 16900.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [29, 17929.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [23, 19934.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 17718.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [30, 20181.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [36, 19927.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 128] + - [14, 18110.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [29, 18359.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 20256.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [23, 20609.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 19472.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [36, 16228.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [8, 15344.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 20781.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 128] + - [29, 17819.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 128] + - [7, 18132.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [7, 15982.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [29, 17683.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 19666.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [34, 16006.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [14, 17098.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [22, 14039.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [23, 20377.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [30, 19542.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [15, 20144.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [14, 17059.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [29, 17942.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 128] + - [6, 12532.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [36, 19828.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 128] + - [35, 17998.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [24, 16855.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [14, 16332.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 20051.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [23, 20207.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 20395.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [23, 17192.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [36, 19846.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 18514.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [1, 18694.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [36, 20045.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 19817.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [7, 20053.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 20897.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [36, 18471.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [36, 19651.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [15, 20633.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [1, 20082.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [14, 17328.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 18943.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 128] + - [36, 19026.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 128] + - [14, 16365.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [29, 17638.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 128] + - [1, 19199.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [22, 18373.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 19257.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [14, 17188.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [35, 14688.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 128] + - [23, 18859.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [22, 14498.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 128] + - [6, 13841.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 128] + - [29, 18062.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 128] + - [0, 18523.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [14, 17147.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 128] + - [6, 17396.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [7, 19167.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [29, 18156.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 128] + - [15, 19170.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [36, 18599.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [23, 20530.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [14, 16071.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 128] + - [1, 19364.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [15, 20386.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [6, 17474.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [23, 20613.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 128] + - [29, 15185.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [27, 20522.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 128] + - [29, 17644.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 128] + - [7, 19447.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [36, 19697.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [0, 15779.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [29, 18256.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [15, 19098.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 20258.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [0, 17716.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [23, 19017.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [14, 17773.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [36, 19993.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [23, 20477.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [36, 19118.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [29, 16843.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [21, 16672.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [6, 15770.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [14, 18180.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [14, 18607.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [0, 14233.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [30, 19998.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [36, 19638.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [34, 16876.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 20296.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [1, 18991.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 20751.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [6, 18094.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 128] + - [14, 18084.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [23, 19514.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [1, 19360.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [30, 20450.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [1, 17329.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 128] + - [6, 17691.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 20196.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [29, 17951.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [14, 17704.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 128] + - [29, 15677.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 128] + - [14, 17049.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [29, 18773.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [36, 18999.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 18682.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 20882.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [23, 19587.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 17207.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [36, 18916.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [7, 19600.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [23, 20368.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [30, 19573.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [34, 15541.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 128] + - [0, 17696.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [23, 19466.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [29, 17470.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [22, 15259.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [8, 16137.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 128] + - [6, 15488.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [22, 17398.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 128] + - [1, 18779.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [31, 18230.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 19515.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [23, 18358.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [35, 15868.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 128] + - [29, 18699.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [23, 19174.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [15, 20173.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [14, 15429.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 128] + - [0, 17410.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [36, 19065.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [29, 17487.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 20572.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [15, 18499.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 17086.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 128] + - [14, 16295.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [29, 18079.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 20202.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [29, 17603.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [8, 17956.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [30, 18112.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 128] + - [29, 18753.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [7, 16110.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [23, 20193.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [37, 15719.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 20166.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 128] + - [14, 16129.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 128] + - [0, 15078.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [36, 20392.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [15, 19569.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [36, 18936.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [36, 18330.0] + - - [704, 4288, 1, 128, 704, 704, 704, 128] + - [6, 14647.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [15, 19334.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [14, 16969.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [22, 15973.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 128] + - [14, 18051.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [35, 17767.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 19671.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [15, 19946.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [29, 16935.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 20768.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [7, 16511.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [7, 19174.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [27, 20411.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 128] + - [22, 16453.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [39, 15672.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [0, 15820.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [10, 9356.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [14, 14929.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [39, 15868.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 10386.0] + - - [4096, 512, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 16825.0] + - - [2048, 768, 1, 512, 2048, 2048, 2048, 512] + - [16, 15947.0] + - - [4096, 512, 1, 2048, 4096, 4096, 4096, 2048] + - [34, 17432.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 2048] + - [29, 18029.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [40, 17417.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 2048, 4096] + - [28, 17588.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [29, 17897.0] + - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] + - [21, 16565.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [22, 18086.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] + - [28, 17014.0] + - - [4096, 384, 1, 2048, 4096, 4096, 4096, 2048] + - [7, 16570.0] + - - [1225, 192, 64, 384, 1225, 1225, 1225, 384] + - [0, 17453.0] + - - [289, 128, 64, 1024, 289, 289, 289, 1024] + - [22, 13482.0] + - - [4096, 384, 1, 1536, 4096, 4096, 4096, 1536] + - [1, 16595.0] + - - [289, 192, 64, 1024, 289, 289, 289, 1024] + - [38, 13607.0] + - - [4096, 384, 1, 1280, 4096, 4096, 4096, 1280] + - [36, 16516.0] + - - [4096, 448, 1, 1280, 4096, 4096, 4096, 1280] + - [14, 16483.0] + - - [289, 256, 64, 1024, 289, 289, 289, 1024] + - [11, 14095.0] + - - [4096, 448, 1, 2048, 4096, 4096, 4096, 2048] + - [22, 16502.0] + - - [289, 384, 64, 1024, 289, 289, 289, 1024] + - [11, 14355.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18169.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19141.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19377.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18123.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20080.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19814.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19058.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17773.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18960.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19588.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19431.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18795.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17731.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19570.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19585.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19950.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18458.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18531.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20286.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18664.0] + - - [1024, 3955, 1, 33708, 1024, 1024, 1024, 33708] + - [30, 19480.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 18696.0] + - - [1024, 3906, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19266.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19302.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19906.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18325.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18432.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17934.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18593.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19607.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18146.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17735.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18973.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19569.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18010.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17604.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19656.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20189.0] + - - [1024, 3996, 1, 33708, 1024, 1024, 1024, 33708] + - [0, 18828.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18869.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19063.0] + - - [1024, 3990, 1, 33708, 1024, 1024, 1024, 33708] + - [0, 18800.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18984.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19814.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18034.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19658.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18020.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18229.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19546.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18480.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18178.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19928.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20207.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19082.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17809.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19335.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19703.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18687.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18810.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18987.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19974.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19597.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19222.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17824.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18402.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19762.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18147.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19054.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19798.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19531.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19317.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20065.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18942.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19660.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18313.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19459.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17826.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19721.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19863.0] + - - [1024, 3925, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19361.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18539.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18885.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19764.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20149.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17665.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19954.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18347.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17949.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19051.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17704.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18987.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18128.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19849.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19664.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18958.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19619.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19682.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20053.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18750.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20306.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19290.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17690.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19268.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19658.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19607.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20129.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20089.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18245.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19298.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19509.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17889.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19184.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18700.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19555.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18071.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18926.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20198.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19555.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17544.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19580.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19535.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19510.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17858.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20278.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17929.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19395.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17716.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18625.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19888.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20028.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17522.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19305.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19423.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17705.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19664.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18523.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19823.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20122.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19700.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19621.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19273.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19733.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19698.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19952.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17856.0] + - - [1024, 3859, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 19038.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19321.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17717.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19359.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19869.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19603.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19481.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18553.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18619.0] + - - [1024, 4030, 1, 33708, 1024, 1024, 1024, 33708] + - [0, 18977.0] + - - [1024, 3780, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 18660.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18402.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19921.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19580.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19411.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17597.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19760.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19691.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20257.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18962.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20105.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 18570.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18468.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18155.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18903.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 19069.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19790.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19185.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17540.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18931.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19584.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19460.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19601.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20094.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20258.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18098.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18976.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19404.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19559.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17824.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20034.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18320.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19475.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19536.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17978.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18660.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19684.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19881.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20119.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19745.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18611.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19522.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18904.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18343.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19306.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19745.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20046.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17751.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19164.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19604.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18592.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19832.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20142.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17896.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19542.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19670.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19622.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19634.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18503.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19886.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20288.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19111.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17832.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19419.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19635.0] + - - [1024, 4032, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18947.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19697.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19619.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17894.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18332.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19570.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19180.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17962.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18169.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19862.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19496.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19613.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17915.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17925.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19772.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17623.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19210.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17789.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19522.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18461.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20060.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19621.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17792.0] + - - [1024, 3876, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19132.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19354.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19633.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19871.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18021.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19514.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18701.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20116.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19436.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19473.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18962.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18556.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18498.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19936.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19919.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19687.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20120.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19359.0] + - - [1024, 3894, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19199.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19965.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19505.0] + - - [1024, 4005, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18858.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19443.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19972.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19862.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17697.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19136.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19589.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18725.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18409.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20100.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20035.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19402.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19193.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 19585.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19485.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19494.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18732.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19676.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19903.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19929.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20179.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18812.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19737.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17622.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19551.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17919.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19659.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19954.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18944.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19114.0] + - - [1024, 3681, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18623.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19174.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19583.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19119.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17545.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18908.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19407.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19567.0] + - - [1024, 4050, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 17924.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18606.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19656.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20254.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18249.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19626.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19443.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19714.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20113.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18563.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17869.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19739.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20096.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17835.0] + - - [1024, 3942, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19419.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18882.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19329.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17684.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19539.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19073.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19857.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20183.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17812.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17797.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18933.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17416.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20154.0] + - - [1024, 3822, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 18894.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19969.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19558.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19356.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18924.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18856.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17602.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19854.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19358.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20031.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19468.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18470.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18759.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18506.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19969.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19855.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19180.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17565.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19227.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19436.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18595.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19776.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18754.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19477.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19941.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18099.0] + - - [1024, 3910, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19299.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19013.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19333.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18835.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18102.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19788.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18710.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19976.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18960.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19219.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17910.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19634.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19935.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18615.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17874.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19509.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20229.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19606.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20207.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19691.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20102.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 18927.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 18760.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19203.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19480.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17757.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19061.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17626.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19597.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19593.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19660.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18011.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20140.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19555.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18470.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19296.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20202.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17748.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19841.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19593.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19794.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18024.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19197.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17931.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19115.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19681.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20140.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18815.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19293.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19351.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17691.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18653.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19765.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19702.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19587.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19135.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19377.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19835.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19875.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17599.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18580.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18819.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19858.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 18577.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19591.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19036.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18948.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19902.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19848.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19692.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20096.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19427.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19348.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18199.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19673.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19561.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18368.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19942.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19338.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19497.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19393.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19346.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17604.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19506.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19664.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19770.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18806.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18362.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19352.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20066.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19141.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20185.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18875.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17645.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19673.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19549.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17797.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17877.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18055.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20110.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19276.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19400.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19843.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19596.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19943.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19123.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17552.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19090.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20175.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19270.0] + - - [1024, 4059, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 17982.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18933.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18603.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19724.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20019.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 19581.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19182.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18586.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18289.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18762.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18768.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19609.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19025.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19418.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17735.0] + - - [1024, 3999, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18825.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19700.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19577.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19652.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18887.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20156.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19265.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19229.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17715.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19810.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19612.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17495.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19560.0] + - - [1024, 3976, 1, 33708, 1024, 1024, 1024, 33708] + - [22, 18696.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19379.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19703.0] + - - [1024, 3751, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 18512.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 18189.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19616.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18150.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19972.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19619.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18890.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17535.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19402.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19422.0] + - - [1024, 3860, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 19042.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18139.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19573.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19736.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18359.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20043.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17583.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19615.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19446.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19290.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 18793.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19881.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18675.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18264.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18383.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19222.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17921.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18598.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19555.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20002.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19959.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20208.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18284.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18591.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20041.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20035.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19481.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18026.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 19019.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18702.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19693.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19945.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18506.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19472.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18956.0] + - - [1024, 4020, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18923.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18447.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20101.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18098.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19719.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19956.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17765.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19074.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 19657.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18708.0] + - - [1024, 3870, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 19060.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19586.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17855.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19569.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17965.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19115.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19665.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19525.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18599.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19622.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20295.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19446.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19274.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17922.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19750.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19712.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19229.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19133.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19667.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18795.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20337.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17991.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19319.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17615.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19773.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20191.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19324.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19317.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17799.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18559.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17836.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19718.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19497.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19414.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19879.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19149.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18405.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20071.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19049.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17773.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19464.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19670.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19961.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19909.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19111.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17933.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18362.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19333.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19792.0] + - - [1024, 3956, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19506.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18451.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19273.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18890.0] + - - [1024, 3944, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19422.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17657.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19342.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19552.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17693.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19114.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18936.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18558.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17410.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19679.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19928.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19182.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17659.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19057.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20274.0] + - - [1024, 3969, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18712.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18070.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18927.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19970.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18511.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19666.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19963.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18176.0] + - - [1024, 3978, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18744.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19115.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19508.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19795.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18686.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19662.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18749.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19036.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19934.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19027.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17754.0] + - - [1024, 3968, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19544.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19505.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19724.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 18682.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20242.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19225.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19614.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17631.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17943.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19867.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18028.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19607.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19096.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19246.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18953.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18799.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17540.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19560.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17799.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19446.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20175.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19914.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20216.0] + - - [1024, 3640, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18434.0] + - - [1024, 3995, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18831.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18185.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20015.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19687.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19191.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17574.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19423.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19368.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19820.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18417.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19058.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19806.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19620.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19210.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18210.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19911.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19906.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18615.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19081.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19321.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19584.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19194.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17942.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19809.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19547.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19996.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18947.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19122.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18343.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18996.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18881.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19236.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19765.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19653.0] + - - [1024, 3939, 1, 33708, 1024, 1024, 1024, 33708] + - [7, 19415.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19514.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19054.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18928.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17574.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20032.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19433.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19011.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20181.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18057.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18059.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17770.0] + - - [1024, 3975, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18747.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19636.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17865.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19558.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18057.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18439.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18937.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19035.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20215.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19786.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17703.0] + - - [1024, 3977, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18716.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19184.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19514.0] + - - [1024, 4026, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18951.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18885.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18957.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19611.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20012.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18868.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19360.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18252.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19126.0] + - - [1024, 3584, 1, 33708, 1024, 1024, 1024, 33708] + - [29, 18169.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19225.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17972.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18613.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19820.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20112.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19642.0] + - - [1024, 3900, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 19236.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19061.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19228.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18901.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19685.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17989.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17608.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18067.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19840.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20164.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19287.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19115.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19811.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19748.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19668.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18002.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19110.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19517.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19319.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18148.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19411.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19594.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18143.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19879.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19641.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19533.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 18764.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19722.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20336.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20051.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19129.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19429.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17845.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18820.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19341.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19746.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20040.0] + - - [1024, 3796, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 18779.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18829.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19913.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17655.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19491.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19448.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17860.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18082.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19815.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19577.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 18968.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19342.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19395.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19617.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20002.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20111.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17648.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20224.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20031.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19606.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17699.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20041.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19514.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19143.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19461.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19792.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17679.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19666.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19971.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19060.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20164.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19311.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17677.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18269.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19571.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19723.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20004.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 18131.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18798.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18754.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19183.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19055.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 17863.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19859.0] + - - [1024, 3720, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 18371.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18731.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18820.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17569.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19639.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19182.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19426.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20157.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18657.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19646.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20294.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 17986.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17746.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19414.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19602.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19756.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19717.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19880.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19732.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19762.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19408.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19500.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19651.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17944.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19333.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19539.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19819.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 18223.0] + - - [1024, 3840, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 18979.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19592.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19563.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 19050.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19701.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18617.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19362.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19201.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 17954.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19412.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19888.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19620.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20053.0] + - - [1024, 4012, 1, 33708, 1024, 1024, 1024, 33708] + - [14, 18899.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19631.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19456.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 18245.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20015.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19919.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19906.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 18408.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19071.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 18849.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17526.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 17572.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20006.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18622.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19483.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18990.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20043.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19663.0] + - - [36548, 1216, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 19346.0] + - - [1024, 2592, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 17004.0] + - - [1024, 1568, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 16548.0] + - - [1024, 4445, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18864.0] + - - [1024, 6272, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19736.0] + - - [36548, 3584, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20609.0] + - - [1024, 1827, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 16492.0] + - - [1024, 3220, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 17969.0] + - - [1024, 1856, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 16853.0] + - - [1024, 1760, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 15912.0] + - - [36548, 4235, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20054.0] + - - [1024, 1984, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17925.0] + - - [1024, 14720, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 20051.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 17512.0] + - - [36548, 14976, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20678.0] + - - [36548, 1152, 1, 1024, 36548, 36548, 36548, 1024] + - [4, 20216.0] + - - [1024, 3392, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18410.0] + - - [1024, 1408, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17292.0] + - - [1024, 2080, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 16960.0] + - - [1024, 1824, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 16482.0] + - - [36548, 2432, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20484.0] + - - [36548, 1827, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 19479.0] + - - [1024, 10176, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19757.0] + - - [1024, 1952, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17533.0] + - - [1024, 17024, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 20033.0] + - - [1024, 1472, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 15500.0] + - - [36548, 4459, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20490.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18517.0] + - - [36548, 12928, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20667.0] + - - [1024, 1632, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 16907.0] + - - [1024, 1696, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17531.0] + - - [36548, 1764, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20092.0] + - - [1024, 2944, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17372.0] + - - [36548, 14080, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20670.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 15702.0] + - - [1024, 13440, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19772.0] + - - [36548, 9120, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20458.0] + - - [1024, 3008, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 17708.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18064.0] + - - [1024, 2208, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 17909.0] + - - [1024, 1920, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17422.0] + - - [36548, 2496, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20040.0] + - - [1024, 2016, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 16269.0] + - - [1024, 1184, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 14463.0] + - - [1024, 1664, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 17596.0] + - - [1024, 11424, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 20071.0] + - - [1024, 1216, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 15065.0] + - - [36548, 3185, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20475.0] + - - [36548, 9216, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20669.0] + - - [1024, 3200, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 17886.0] + - - [1024, 2656, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 17365.0] + - - [1024, 2368, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 16873.0] + - - [1024, 4459, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18946.0] + - - [1024, 3808, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18377.0] + - - [1024, 2336, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 16484.0] + - - [1024, 2304, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 18945.0] + - - [1024, 1560, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 16480.0] + - - [1024, 2496, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17639.0] + - - [1024, 1504, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 15799.0] + - - [1024, 3232, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18040.0] + - - [36548, 1015, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20147.0] + - - [1024, 2000, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 16167.0] + - - [36548, 243, 1, 1024, 36548, 36548, 36548, 1024] + - [11, 18577.0] + - - [1024, 13184, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 20194.0] + - - [1024, 2688, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 17810.0] + - - [36548, 950, 1, 1024, 36548, 36548, 36548, 1024] + - [29, 19014.0] + - - [1024, 1764, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 15883.0] + - - [1024, 1376, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 16769.0] + - - [36548, 774, 1, 1024, 36548, 36548, 36548, 1024] + - [29, 17865.0] + - - [1024, 4256, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18418.0] + - - [36548, 3712, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20576.0] + - - [1024, 3360, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18240.0] + - - [1024, 2784, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18250.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19084.0] + - - [36548, 1102, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 19437.0] + - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 16203.0] + - - [1024, 2720, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 17799.0] + - - [1024, 2752, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18090.0] + - - [1024, 2816, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18592.0] + - - [1024, 2624, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 17303.0] + - - [1024, 2144, 1, 1024, 1024, 1024, 1024, 1024] + - [25, 17411.0] + - - [36548, 1131, 1, 1024, 36548, 36548, 36548, 1024] + - [19, 19890.0] + - - [1024, 3296, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 17957.0] + - - [36548, 4992, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20635.0] + - - [1024, 1344, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 16372.0] + - - [36548, 2401, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20206.0] + - - [1024, 15744, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 19885.0] + - - [1024, 15232, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 20038.0] + - - [1024, 1888, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 16990.0] + - - [1024, 1792, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 16334.0] + - - [36548, 1073, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 18981.0] + - - [36548, 15488, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20665.0] + - - [1024, 2464, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17366.0] + - - [1024, 2272, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18348.0] + - - [1024, 2432, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17224.0] + - - [1024, 3936, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18970.0] + - - [36548, 13824, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20679.0] + - - [1024, 2401, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 16931.0] + - - [1024, 2176, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 17868.0] + - - [1024, 2240, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18129.0] + - - [1024, 1728, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17990.0] + - - [1024, 2528, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17822.0] + - - [1024, 2400, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 16995.0] + - - [1024, 1440, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 15274.0] + - - [1024, 2912, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17133.0] + - - [1024, 2880, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 18500.0] + - - [1024, 4064, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17763.0] + - - [1024, 4655, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 17929.0] + - - [36548, 6272, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 20650.0] + - - [768, 2048, 1, 3072, 768, 768, 768, 3072] + - [8, 16637.0] + - - [768, 4096, 1, 3072, 768, 768, 768, 3072] + - [22, 18282.0] + - - [6272, 256, 1, 528, 6272, 6272, 6272, 528] + - [16, 16545.0] + - - [3136, 2048, 1, 1024, 3136, 3136, 3136, 1024] + - [23, 18266.0] + - - [50176, 128, 1, 256, 50176, 50176, 50176, 256] + - [36, 19100.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [30, 19702.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [0, 18538.0] + - - [3136, 512, 1, 1024, 3136, 3136, 3136, 1024] + - [8, 16594.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [23, 18052.0] + - - [289, 384, 32, 1024, 289, 289, 289, 1024] + - [23, 14617.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [21, 17563.0] + - - [50176, 512, 1, 256, 50176, 50176, 50176, 256] + - [1, 19939.0] + - - [12544, 1024, 1, 512, 12544, 12544, 12544, 512] + - [1, 19995.0] + - - [12544, 256, 1, 512, 12544, 12544, 12544, 512] + - [14, 18334.0] + - - [784, 128, 32, 256, 784, 784, 784, 256] + - [22, 15034.0] + - - [4096, 512, 1, 9216, 4096, 4096, 4096, 9216] + - [21, 17707.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [16, 16798.0] + - - [1225, 192, 32, 384, 1225, 1225, 1225, 384] + - [29, 17898.0] + - - [8192, 320, 1, 1280, 8192, 8192, 8192, 1280] + - [14, 18234.0] + - - [8192, 320, 1, 2048, 8192, 8192, 8192, 2048] + - [0, 18276.0] + - - [8192, 384, 1, 1280, 8192, 8192, 8192, 1280] + - [0, 18197.0] + - - [8192, 384, 1, 2048, 8192, 8192, 8192, 2048] + - [0, 18254.0] + - - [8192, 448, 1, 2048, 8192, 8192, 8192, 2048] + - [14, 18134.0] + - - [8192, 448, 1, 1280, 8192, 8192, 8192, 1280] + - [14, 18075.0] + - - [256, 6400, 1, 4096, 256, 256, 256, 4096] + - [23, 17244.0] + - - [512, 3433, 1, 2048, 512, 512, 512, 2048] + - [37, 18323.0] + - - [512, 3439, 1, 2048, 512, 512, 512, 2048] + - [24, 18363.0] + - - [512, 3461, 1, 2048, 512, 512, 512, 2048] + - [35, 15935.0] + - - [512, 3479, 1, 2048, 512, 512, 512, 2048] + - [22, 16015.0] + - - [512, 3494, 1, 2048, 512, 512, 512, 2048] + - [6, 16060.0] + - - [512, 3520, 1, 2048, 512, 512, 512, 2048] + - [6, 16227.0] + - - [512, 3530, 1, 2048, 512, 512, 512, 2048] + - [22, 16239.0] + - - [512, 3541, 1, 2048, 512, 512, 512, 2048] + - [35, 16257.0] + - - [512, 3564, 1, 2048, 512, 512, 512, 2048] + - [22, 16364.0] + - - [512, 3776, 1, 2048, 512, 512, 512, 2048] + - [35, 17346.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [6, 16866.0] + - - [512, 3925, 1, 2048, 512, 512, 512, 2048] + - [35, 17944.0] + - - [512, 3944, 1, 2048, 512, 512, 512, 2048] + - [6, 18048.0] + - - [512, 3955, 1, 2048, 512, 512, 512, 2048] + - [22, 18075.0] + - - [512, 3969, 1, 2048, 512, 512, 512, 2048] + - [13, 16750.0] + - - [512, 3976, 1, 2048, 512, 512, 512, 2048] + - [13, 16783.0] + - - [2048, 1232, 1, 512, 2048, 2048, 2048, 512] + - [29, 17138.0] + - - [2048, 3165, 1, 512, 2048, 2048, 2048, 512] + - [29, 18151.0] + - - [512, 2387, 1, 512, 512, 512, 512, 512] + - [35, 14254.0] + - - [512, 2418, 1, 512, 512, 512, 512, 512] + - [35, 14347.0] + - - [512, 2418, 1, 2048, 512, 512, 512, 2048] + - [35, 15121.0] + - - [512, 2496, 1, 512, 512, 512, 512, 512] + - [35, 15160.0] + - - [512, 2496, 1, 2048, 512, 512, 512, 2048] + - [6, 15661.0] + - - [512, 2790, 1, 2048, 512, 512, 512, 2048] + - [6, 17352.0] + - - [512, 2864, 1, 2048, 512, 512, 512, 2048] + - [8, 15455.0] + - - [512, 3092, 1, 2048, 512, 512, 512, 2048] + - [8, 16642.0] + - - [512, 3113, 1, 2048, 512, 512, 512, 2048] + - [37, 16758.0] + - - [512, 3137, 1, 2048, 512, 512, 512, 2048] + - [37, 16844.0] + - - [512, 3165, 1, 2048, 512, 512, 512, 2048] + - [24, 16993.0] + - - [512, 3166, 1, 2048, 512, 512, 512, 2048] + - [24, 17035.0] + - - [512, 3194, 1, 2048, 512, 512, 512, 2048] + - [8, 17143.0] + - - [512, 3219, 1, 2048, 512, 512, 512, 2048] + - [8, 17306.0] + - - [512, 3222, 1, 2048, 512, 512, 512, 2048] + - [24, 17287.0] + - - [512, 3234, 1, 2048, 512, 512, 512, 2048] + - [36, 17339.0] + - - [512, 3237, 1, 2048, 512, 512, 512, 2048] + - [8, 17390.0] + - - [512, 3242, 1, 2048, 512, 512, 512, 2048] + - [37, 17412.0] + - - [512, 3246, 1, 2048, 512, 512, 512, 2048] + - [24, 17417.0] + - - [512, 3249, 1, 2048, 512, 512, 512, 2048] + - [24, 17446.0] + - - [512, 3251, 1, 2048, 512, 512, 512, 2048] + - [24, 17457.0] + - - [512, 3257, 1, 2048, 512, 512, 512, 2048] + - [23, 17462.0] + - - [512, 3262, 1, 2048, 512, 512, 512, 2048] + - [8, 17512.0] + - - [512, 3268, 1, 2048, 512, 512, 512, 2048] + - [8, 17537.0] + - - [512, 3282, 1, 2048, 512, 512, 512, 2048] + - [24, 17605.0] + - - [512, 3286, 1, 2048, 512, 512, 512, 2048] + - [8, 17616.0] + - - [512, 3287, 1, 2048, 512, 512, 512, 2048] + - [8, 17625.0] + - - [512, 3293, 1, 2048, 512, 512, 512, 2048] + - [8, 17653.0] + - - [512, 3297, 1, 2048, 512, 512, 512, 2048] + - [24, 17658.0] + - - [512, 3307, 1, 2048, 512, 512, 512, 2048] + - [37, 17719.0] + - - [512, 3314, 1, 2048, 512, 512, 512, 2048] + - [37, 17728.0] + - - [512, 3315, 1, 2048, 512, 512, 512, 2048] + - [8, 17760.0] + - - [512, 3319, 1, 2048, 512, 512, 512, 2048] + - [8, 17724.0] + - - [512, 3322, 1, 2048, 512, 512, 512, 2048] + - [37, 17756.0] + - - [512, 3323, 1, 2048, 512, 512, 512, 2048] + - [36, 17750.0] + - - [512, 3324, 1, 2048, 512, 512, 512, 2048] + - [23, 17776.0] + - - [512, 3325, 1, 2048, 512, 512, 512, 2048] + - [24, 17799.0] + - - [512, 3327, 1, 2048, 512, 512, 512, 2048] + - [8, 17788.0] + - - [512, 3329, 1, 2048, 512, 512, 512, 2048] + - [37, 17852.0] + - - [512, 3332, 1, 2048, 512, 512, 512, 2048] + - [24, 17864.0] + - - [512, 3336, 1, 2048, 512, 512, 512, 2048] + - [8, 17860.0] + - - [512, 3339, 1, 2048, 512, 512, 512, 2048] + - [37, 17891.0] + - - [512, 3342, 1, 2048, 512, 512, 512, 2048] + - [8, 17919.0] + - - [512, 3344, 1, 2048, 512, 512, 512, 2048] + - [8, 17912.0] + - - [512, 3358, 1, 2048, 512, 512, 512, 2048] + - [8, 17970.0] + - - [512, 3360, 1, 2048, 512, 512, 512, 2048] + - [37, 17998.0] + - - [512, 3364, 1, 2048, 512, 512, 512, 2048] + - [8, 17997.0] + - - [512, 3365, 1, 2048, 512, 512, 512, 2048] + - [37, 18021.0] + - - [512, 3369, 1, 2048, 512, 512, 512, 2048] + - [24, 18031.0] + - - [512, 3370, 1, 2048, 512, 512, 512, 2048] + - [8, 18048.0] + - - [512, 3371, 1, 2048, 512, 512, 512, 2048] + - [37, 18033.0] + - - [512, 3374, 1, 2048, 512, 512, 512, 2048] + - [8, 18078.0] + - - [512, 3376, 1, 2048, 512, 512, 512, 2048] + - [37, 18076.0] + - - [512, 3377, 1, 2048, 512, 512, 512, 2048] + - [8, 18074.0] + - - [512, 3378, 1, 2048, 512, 512, 512, 2048] + - [37, 18088.0] + - - [512, 3381, 1, 2048, 512, 512, 512, 2048] + - [37, 18105.0] + - - [512, 3382, 1, 2048, 512, 512, 512, 2048] + - [8, 18138.0] + - - [512, 3383, 1, 2048, 512, 512, 512, 2048] + - [24, 18121.0] + - - [512, 3384, 1, 2048, 512, 512, 512, 2048] + - [37, 18106.0] + - - [512, 3385, 1, 2048, 512, 512, 512, 2048] + - [8, 18111.0] + - - [512, 3386, 1, 2048, 512, 512, 512, 2048] + - [8, 18128.0] + - - [512, 3388, 1, 2048, 512, 512, 512, 2048] + - [24, 18142.0] + - - [512, 3390, 1, 2048, 512, 512, 512, 2048] + - [8, 18129.0] + - - [512, 3391, 1, 2048, 512, 512, 512, 2048] + - [8, 18112.0] + - - [512, 3396, 1, 2048, 512, 512, 512, 2048] + - [37, 18207.0] + - - [512, 3399, 1, 2048, 512, 512, 512, 2048] + - [24, 18182.0] + - - [512, 3402, 1, 2048, 512, 512, 512, 2048] + - [24, 18230.0] + - - [512, 3410, 1, 2048, 512, 512, 512, 2048] + - [24, 18215.0] + - - [512, 3412, 1, 2048, 512, 512, 512, 2048] + - [37, 18211.0] + - - [512, 3414, 1, 2048, 512, 512, 512, 2048] + - [37, 18261.0] + - - [512, 3415, 1, 2048, 512, 512, 512, 2048] + - [37, 18288.0] + - - [512, 3418, 1, 2048, 512, 512, 512, 2048] + - [24, 18260.0] + - - [512, 3420, 1, 2048, 512, 512, 512, 2048] + - [24, 18287.0] + - - [512, 3422, 1, 2048, 512, 512, 512, 2048] + - [24, 18311.0] + - - [512, 3425, 1, 2048, 512, 512, 512, 2048] + - [24, 18308.0] + - - [512, 3426, 1, 2048, 512, 512, 512, 2048] + - [37, 18312.0] + - - [512, 3427, 1, 2048, 512, 512, 512, 2048] + - [37, 18306.0] + - - [512, 3428, 1, 2048, 512, 512, 512, 2048] + - [8, 18296.0] + - - [512, 3430, 1, 2048, 512, 512, 512, 2048] + - [8, 18287.0] + - - [512, 3431, 1, 2048, 512, 512, 512, 2048] + - [24, 18354.0] + - - [512, 3432, 1, 2048, 512, 512, 512, 2048] + - [8, 18387.0] + - - [512, 3438, 1, 2048, 512, 512, 512, 2048] + - [24, 18370.0] + - - [512, 3440, 1, 2048, 512, 512, 512, 2048] + - [24, 18353.0] + - - [512, 3443, 1, 2048, 512, 512, 512, 2048] + - [37, 18427.0] + - - [512, 3445, 1, 2048, 512, 512, 512, 2048] + - [24, 18382.0] + - - [512, 3447, 1, 2048, 512, 512, 512, 2048] + - [8, 18351.0] + - - [512, 3448, 1, 2048, 512, 512, 512, 2048] + - [37, 18329.0] + - - [512, 3450, 1, 2048, 512, 512, 512, 2048] + - [24, 18393.0] + - - [512, 3451, 1, 2048, 512, 512, 512, 2048] + - [8, 18423.0] + - - [512, 3452, 1, 2048, 512, 512, 512, 2048] + - [8, 18333.0] + - - [512, 3453, 1, 2048, 512, 512, 512, 2048] + - [37, 18464.0] + - - [512, 3455, 1, 2048, 512, 512, 512, 2048] + - [24, 18439.0] + - - [512, 3456, 1, 2048, 512, 512, 512, 2048] + - [37, 18465.0] + - - [512, 3457, 1, 2048, 512, 512, 512, 2048] + - [22, 15935.0] + - - [512, 3458, 1, 2048, 512, 512, 512, 2048] + - [22, 15938.0] + - - [512, 3459, 1, 2048, 512, 512, 512, 2048] + - [35, 15926.0] + - - [512, 3460, 1, 2048, 512, 512, 512, 2048] + - [35, 15943.0] + - - [512, 3462, 1, 2048, 512, 512, 512, 2048] + - [6, 15941.0] + - - [512, 3466, 1, 2048, 512, 512, 512, 2048] + - [22, 16005.0] + - - [512, 3467, 1, 2048, 512, 512, 512, 2048] + - [6, 15980.0] + - - [512, 3468, 1, 2048, 512, 512, 512, 2048] + - [35, 16007.0] + - - [512, 3470, 1, 2048, 512, 512, 512, 2048] + - [35, 15963.0] + - - [512, 3471, 1, 2048, 512, 512, 512, 2048] + - [22, 16120.0] + - - [512, 3472, 1, 2048, 512, 512, 512, 2048] + - [22, 15997.0] + - - [512, 3475, 1, 2048, 512, 512, 512, 2048] + - [6, 15980.0] + - - [512, 3476, 1, 2048, 512, 512, 512, 2048] + - [22, 16013.0] + - - [512, 3477, 1, 2048, 512, 512, 512, 2048] + - [6, 16107.0] + - - [512, 3478, 1, 2048, 512, 512, 512, 2048] + - [35, 15993.0] + - - [512, 3480, 1, 2048, 512, 512, 512, 2048] + - [22, 16031.0] + - - [512, 3481, 1, 2048, 512, 512, 512, 2048] + - [35, 16023.0] + - - [512, 3483, 1, 2048, 512, 512, 512, 2048] + - [35, 16039.0] + - - [512, 3484, 1, 2048, 512, 512, 512, 2048] + - [6, 16034.0] + - - [512, 3487, 1, 2048, 512, 512, 512, 2048] + - [22, 16059.0] + - - [512, 3489, 1, 2048, 512, 512, 512, 2048] + - [22, 16050.0] + - - [512, 3490, 1, 2048, 512, 512, 512, 2048] + - [6, 16070.0] + - - [512, 3491, 1, 2048, 512, 512, 512, 2048] + - [22, 16059.0] + - - [512, 3493, 1, 2048, 512, 512, 512, 2048] + - [22, 16035.0] + - - [512, 3495, 1, 2048, 512, 512, 512, 2048] + - [22, 16045.0] + - - [512, 3497, 1, 2048, 512, 512, 512, 2048] + - [22, 16088.0] + - - [512, 3498, 1, 2048, 512, 512, 512, 2048] + - [6, 16093.0] + - - [512, 3499, 1, 2048, 512, 512, 512, 2048] + - [22, 16109.0] + - - [512, 3501, 1, 2048, 512, 512, 512, 2048] + - [35, 16149.0] + - - [512, 3503, 1, 2048, 512, 512, 512, 2048] + - [35, 16149.0] + - - [512, 3505, 1, 2048, 512, 512, 512, 2048] + - [6, 16103.0] + - - [512, 3507, 1, 2048, 512, 512, 512, 2048] + - [35, 16150.0] + - - [512, 3508, 1, 2048, 512, 512, 512, 2048] + - [35, 16121.0] + - - [512, 3509, 1, 2048, 512, 512, 512, 2048] + - [22, 16182.0] + - - [512, 3510, 1, 2048, 512, 512, 512, 2048] + - [35, 16169.0] + - - [512, 3511, 1, 2048, 512, 512, 512, 2048] + - [6, 16150.0] + - - [512, 3513, 1, 2048, 512, 512, 512, 2048] + - [6, 16135.0] + - - [512, 3514, 1, 2048, 512, 512, 512, 2048] + - [35, 16097.0] + - - [512, 3515, 1, 2048, 512, 512, 512, 2048] + - [6, 16168.0] + - - [512, 3517, 1, 2048, 512, 512, 512, 2048] + - [6, 16162.0] + - - [512, 3518, 1, 2048, 512, 512, 512, 2048] + - [6, 16164.0] + - - [512, 3519, 1, 2048, 512, 512, 512, 2048] + - [6, 16136.0] + - - [512, 3523, 1, 2048, 512, 512, 512, 2048] + - [6, 16228.0] + - - [512, 3528, 1, 2048, 512, 512, 512, 2048] + - [6, 16222.0] + - - [512, 3529, 1, 2048, 512, 512, 512, 2048] + - [22, 16206.0] + - - [512, 3531, 1, 2048, 512, 512, 512, 2048] + - [22, 16205.0] + - - [512, 3532, 1, 2048, 512, 512, 512, 2048] + - [22, 16229.0] + - - [512, 3533, 1, 2048, 512, 512, 512, 2048] + - [22, 16254.0] + - - [512, 3534, 1, 2048, 512, 512, 512, 2048] + - [22, 16223.0] + - - [512, 3538, 1, 2048, 512, 512, 512, 2048] + - [22, 16260.0] + - - [512, 3539, 1, 2048, 512, 512, 512, 2048] + - [6, 16255.0] + - - [512, 3540, 1, 2048, 512, 512, 512, 2048] + - [22, 16252.0] + - - [512, 3547, 1, 2048, 512, 512, 512, 2048] + - [22, 16301.0] + - - [512, 3548, 1, 2048, 512, 512, 512, 2048] + - [22, 16300.0] + - - [512, 3552, 1, 2048, 512, 512, 512, 2048] + - [22, 16318.0] + - - [512, 3575, 1, 2048, 512, 512, 512, 2048] + - [6, 16404.0] + - - [512, 3598, 1, 2048, 512, 512, 512, 2048] + - [22, 16534.0] + - - [512, 3599, 1, 2048, 512, 512, 512, 2048] + - [22, 16517.0] + - - [512, 3608, 1, 2048, 512, 512, 512, 2048] + - [22, 16577.0] + - - [512, 3776, 1, 512, 512, 512, 512, 512] + - [35, 16863.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [6, 16587.0] + - - [512, 3780, 1, 2048, 512, 512, 512, 2048] + - [6, 17333.0] + - - [512, 3780, 1, 33708, 512, 512, 512, 33708] + - [0, 17480.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [22, 15988.0] + - - [512, 3796, 1, 2048, 512, 512, 512, 2048] + - [22, 17417.0] + - - [512, 3796, 1, 33708, 512, 512, 512, 33708] + - [0, 17531.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [35, 16186.0] + - - [512, 3822, 1, 2048, 512, 512, 512, 2048] + - [6, 17449.0] + - - [512, 3822, 1, 33708, 512, 512, 512, 33708] + - [14, 17646.0] + - - [512, 3835, 1, 512, 512, 512, 512, 512] + - [35, 16283.0] + - - [512, 3835, 1, 2048, 512, 512, 512, 2048] + - [22, 17522.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [22, 17131.0] + - - [512, 3840, 1, 2048, 512, 512, 512, 2048] + - [22, 17597.0] + - - [512, 3840, 1, 33708, 512, 512, 512, 33708] + - [29, 17723.0] + - - [512, 3859, 1, 2048, 512, 512, 512, 2048] + - [22, 17610.0] + - - [512, 3859, 1, 33708, 512, 512, 512, 33708] + - [22, 17792.0] + - - [512, 3864, 1, 512, 512, 512, 512, 512] + - [35, 16369.0] + - - [512, 3864, 1, 2048, 512, 512, 512, 2048] + - [35, 17693.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [35, 16953.0] + - - [512, 3870, 1, 2048, 512, 512, 512, 2048] + - [35, 17703.0] + - - [512, 3870, 1, 33708, 512, 512, 512, 33708] + - [14, 17814.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [6, 16372.0] + - - [512, 3876, 1, 2048, 512, 512, 512, 2048] + - [6, 17723.0] + - - [512, 3876, 1, 33708, 512, 512, 512, 33708] + - [29, 17836.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [35, 16563.0] + - - [512, 3906, 1, 2048, 512, 512, 512, 2048] + - [22, 17887.0] + - - [512, 3906, 1, 33708, 512, 512, 512, 33708] + - [0, 18023.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [35, 16650.0] + - - [512, 3910, 1, 2048, 512, 512, 512, 2048] + - [35, 17899.0] + - - [512, 3910, 1, 33708, 512, 512, 512, 33708] + - [0, 18001.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [35, 16595.0] + - - [512, 3925, 1, 33708, 512, 512, 512, 33708] + - [29, 18044.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [22, 16647.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [35, 17304.0] + - - [512, 3942, 1, 2048, 512, 512, 512, 2048] + - [6, 18038.0] + - - [512, 3942, 1, 33708, 512, 512, 512, 33708] + - [29, 18138.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [35, 16708.0] + - - [512, 3944, 1, 33708, 512, 512, 512, 33708] + - [14, 18136.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [22, 16755.0] + - - [512, 3955, 1, 33708, 512, 512, 512, 33708] + - [14, 18201.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [35, 17547.0] + - - [512, 3968, 1, 2048, 512, 512, 512, 2048] + - [35, 18184.0] + - - [512, 3968, 1, 33708, 512, 512, 512, 33708] + - [0, 18262.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [22, 15580.0] + - - [512, 3969, 1, 33708, 512, 512, 512, 33708] + - [34, 17246.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [22, 15519.0] + - - [512, 3976, 1, 33708, 512, 512, 512, 33708] + - [5, 17274.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [6, 15441.0] + - - [512, 3977, 1, 2048, 512, 512, 512, 2048] + - [40, 16765.0] + - - [512, 3977, 1, 33708, 512, 512, 512, 33708] + - [21, 17293.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [22, 15486.0] + - - [512, 3978, 1, 2048, 512, 512, 512, 2048] + - [13, 16782.0] + - - [512, 3978, 1, 33708, 512, 512, 512, 33708] + - [21, 17294.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [22, 15621.0] + - - [512, 3990, 1, 2048, 512, 512, 512, 2048] + - [40, 16771.0] + - - [512, 3990, 1, 33708, 512, 512, 512, 33708] + - [34, 17350.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [35, 15720.0] + - - [512, 3995, 1, 2048, 512, 512, 512, 2048] + - [40, 16807.0] + - - [512, 3995, 1, 33708, 512, 512, 512, 33708] + - [21, 17373.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [37, 15630.0] + - - [512, 3996, 1, 2048, 512, 512, 512, 2048] + - [40, 16821.0] + - - [512, 3996, 1, 33708, 512, 512, 512, 33708] + - [21, 17383.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [22, 15693.0] + - - [512, 3999, 1, 2048, 512, 512, 512, 2048] + - [40, 16846.0] + - - [512, 3999, 1, 33708, 512, 512, 512, 33708] + - [21, 17386.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [22, 15740.0] + - - [512, 4005, 1, 2048, 512, 512, 512, 2048] + - [40, 16826.0] + - - [512, 4005, 1, 33708, 512, 512, 512, 33708] + - [21, 17417.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [35, 15735.0] + - - [512, 4012, 1, 2048, 512, 512, 512, 2048] + - [13, 16896.0] + - - [512, 4012, 1, 33708, 512, 512, 512, 33708] + - [21, 17438.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [37, 15743.0] + - - [512, 4020, 1, 2048, 512, 512, 512, 2048] + - [13, 16944.0] + - - [512, 4020, 1, 33708, 512, 512, 512, 33708] + - [21, 17483.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [35, 15762.0] + - - [512, 4026, 1, 2048, 512, 512, 512, 2048] + - [28, 16932.0] + - - [512, 4026, 1, 33708, 512, 512, 512, 33708] + - [34, 17502.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [37, 15796.0] + - - [512, 4030, 1, 2048, 512, 512, 512, 2048] + - [13, 16967.0] + - - [512, 4030, 1, 33708, 512, 512, 512, 33708] + - [21, 17518.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [35, 15952.0] + - - [512, 4032, 1, 2048, 512, 512, 512, 2048] + - [13, 16990.0] + - - [512, 4032, 1, 33708, 512, 512, 512, 33708] + - [21, 17533.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [6, 15822.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [36, 15967.0] + - - [2048, 644, 1, 512, 2048, 2048, 2048, 512] + - [29, 15354.0] + - - [2048, 668, 1, 512, 2048, 2048, 2048, 512] + - [29, 15912.0] + - - [2048, 714, 1, 512, 2048, 2048, 2048, 512] + - [29, 14600.0] + - - [2048, 720, 1, 512, 2048, 2048, 2048, 512] + - [29, 14734.0] + - - [2048, 722, 1, 512, 2048, 2048, 2048, 512] + - [29, 14758.0] + - - [2048, 781, 1, 512, 2048, 2048, 2048, 512] + - [29, 15852.0] + - - [2048, 848, 1, 512, 2048, 2048, 2048, 512] + - [35, 15010.0] + - - [2048, 872, 1, 512, 2048, 2048, 2048, 512] + - [35, 15404.0] + - - [2048, 936, 1, 512, 2048, 2048, 2048, 512] + - [6, 16468.0] + - - [2048, 980, 1, 512, 2048, 2048, 2048, 512] + - [29, 15551.0] + - - [2048, 1139, 1, 512, 2048, 2048, 2048, 512] + - [29, 17564.0] + - - [2048, 1184, 1, 512, 2048, 2048, 2048, 512] + - [29, 16518.0] + - - [2048, 1186, 1, 512, 2048, 2048, 2048, 512] + - [29, 16498.0] + - - [2048, 1279, 1, 512, 2048, 2048, 2048, 512] + - [22, 17508.0] + - - [2048, 1290, 1, 512, 2048, 2048, 2048, 512] + - [23, 16444.0] + - - [2048, 1327, 1, 512, 2048, 2048, 2048, 512] + - [35, 16817.0] + - - [2048, 1331, 1, 512, 2048, 2048, 2048, 512] + - [23, 16958.0] + - - [2048, 1341, 1, 512, 2048, 2048, 2048, 512] + - [23, 17106.0] + - - [2048, 1350, 1, 512, 2048, 2048, 2048, 512] + - [23, 17175.0] + - - [2048, 1359, 1, 512, 2048, 2048, 2048, 512] + - [29, 17231.0] + - - [2048, 1391, 1, 512, 2048, 2048, 2048, 512] + - [23, 17620.0] + - - [2048, 1424, 1, 512, 2048, 2048, 2048, 512] + - [29, 16547.0] + - - [2048, 1458, 1, 512, 2048, 2048, 2048, 512] + - [29, 16938.0] + - - [2048, 1462, 1, 512, 2048, 2048, 2048, 512] + - [29, 17003.0] + - - [2048, 1467, 1, 512, 2048, 2048, 2048, 512] + - [29, 17065.0] + - - [2048, 1472, 1, 512, 2048, 2048, 2048, 512] + - [29, 17265.0] + - - [2048, 1520, 1, 512, 2048, 2048, 2048, 512] + - [29, 17666.0] + - - [2048, 1596, 1, 512, 2048, 2048, 2048, 512] + - [7, 17189.0] + - - [2048, 1599, 1, 512, 2048, 2048, 2048, 512] + - [7, 17275.0] + - - [2048, 1615, 1, 512, 2048, 2048, 2048, 512] + - [7, 17422.0] + - - [2048, 1680, 1, 512, 2048, 2048, 2048, 512] + - [29, 17888.0] + - - [2048, 1709, 1, 512, 2048, 2048, 2048, 512] + - [29, 18141.0] + - - [2048, 1902, 1, 512, 2048, 2048, 2048, 512] + - [23, 17836.0] + - - [2048, 1917, 1, 512, 2048, 2048, 2048, 512] + - [23, 17919.0] + - - [2048, 2076, 1, 512, 2048, 2048, 2048, 512] + - [29, 17822.0] + - - [2048, 2195, 1, 512, 2048, 2048, 2048, 512] + - [23, 18149.0] + - - [2048, 2205, 1, 512, 2048, 2048, 2048, 512] + - [7, 18249.0] + - - [2048, 2418, 1, 512, 2048, 2048, 2048, 512] + - [29, 18453.0] + - - [2048, 2496, 1, 512, 2048, 2048, 2048, 512] + - [23, 18504.0] + - - [2048, 2790, 1, 512, 2048, 2048, 2048, 512] + - [7, 18710.0] + - - [2048, 2864, 1, 512, 2048, 2048, 2048, 512] + - [29, 18685.0] + - - [2048, 3092, 1, 512, 2048, 2048, 2048, 512] + - [22, 18320.0] + - - [2048, 3113, 1, 512, 2048, 2048, 2048, 512] + - [29, 18521.0] + - - [2048, 3137, 1, 512, 2048, 2048, 2048, 512] + - [29, 17965.0] + - - [2048, 3166, 1, 512, 2048, 2048, 2048, 512] + - [29, 18078.0] + - - [2048, 3194, 1, 512, 2048, 2048, 2048, 512] + - [7, 18238.0] + - - [2048, 3219, 1, 512, 2048, 2048, 2048, 512] + - [29, 18384.0] + - - [2048, 3222, 1, 512, 2048, 2048, 2048, 512] + - [22, 18353.0] + - - [2048, 3234, 1, 512, 2048, 2048, 2048, 512] + - [29, 18490.0] + - - [2048, 3237, 1, 512, 2048, 2048, 2048, 512] + - [22, 18405.0] + - - [2048, 3242, 1, 512, 2048, 2048, 2048, 512] + - [29, 18526.0] + - - [2048, 3246, 1, 512, 2048, 2048, 2048, 512] + - [29, 18524.0] + - - [2048, 3249, 1, 512, 2048, 2048, 2048, 512] + - [29, 18529.0] + - - [2048, 3251, 1, 512, 2048, 2048, 2048, 512] + - [29, 18561.0] + - - [2048, 3257, 1, 512, 2048, 2048, 2048, 512] + - [29, 18579.0] + - - [2048, 3262, 1, 512, 2048, 2048, 2048, 512] + - [29, 18595.0] + - - [2048, 3268, 1, 512, 2048, 2048, 2048, 512] + - [23, 18511.0] + - - [2048, 3282, 1, 512, 2048, 2048, 2048, 512] + - [23, 18564.0] + - - [2048, 3286, 1, 512, 2048, 2048, 2048, 512] + - [7, 18611.0] + - - [2048, 3287, 1, 512, 2048, 2048, 2048, 512] + - [23, 18633.0] + - - [2048, 3293, 1, 512, 2048, 2048, 2048, 512] + - [7, 18655.0] + - - [2048, 3297, 1, 512, 2048, 2048, 2048, 512] + - [7, 18736.0] + - - [2048, 3307, 1, 512, 2048, 2048, 2048, 512] + - [7, 18746.0] + - - [2048, 3314, 1, 512, 2048, 2048, 2048, 512] + - [7, 18699.0] + - - [2048, 3315, 1, 512, 2048, 2048, 2048, 512] + - [7, 18775.0] + - - [2048, 3319, 1, 512, 2048, 2048, 2048, 512] + - [7, 18731.0] + - - [2048, 3322, 1, 512, 2048, 2048, 2048, 512] + - [23, 18746.0] + - - [2048, 3323, 1, 512, 2048, 2048, 2048, 512] + - [23, 18816.0] + - - [2048, 3324, 1, 512, 2048, 2048, 2048, 512] + - [7, 18812.0] + - - [2048, 3325, 1, 512, 2048, 2048, 2048, 512] + - [7, 18785.0] + - - [2048, 3327, 1, 512, 2048, 2048, 2048, 512] + - [23, 18798.0] + - - [2048, 3329, 1, 512, 2048, 2048, 2048, 512] + - [7, 18769.0] + - - [2048, 3332, 1, 512, 2048, 2048, 2048, 512] + - [7, 18802.0] + - - [2048, 3336, 1, 512, 2048, 2048, 2048, 512] + - [23, 18817.0] + - - [2048, 3339, 1, 512, 2048, 2048, 2048, 512] + - [23, 18929.0] + - - [2048, 3342, 1, 512, 2048, 2048, 2048, 512] + - [23, 18875.0] + - - [2048, 3344, 1, 512, 2048, 2048, 2048, 512] + - [7, 18884.0] + - - [2048, 3358, 1, 512, 2048, 2048, 2048, 512] + - [7, 18953.0] + - - [2048, 3360, 1, 512, 2048, 2048, 2048, 512] + - [23, 19028.0] + - - [2048, 3364, 1, 512, 2048, 2048, 2048, 512] + - [23, 19024.0] + - - [2048, 3365, 1, 512, 2048, 2048, 2048, 512] + - [23, 19036.0] + - - [2048, 3369, 1, 512, 2048, 2048, 2048, 512] + - [7, 18956.0] + - - [2048, 3370, 1, 512, 2048, 2048, 2048, 512] + - [7, 18998.0] + - - [2048, 3371, 1, 512, 2048, 2048, 2048, 512] + - [7, 19018.0] + - - [2048, 3374, 1, 512, 2048, 2048, 2048, 512] + - [23, 19083.0] + - - [2048, 3376, 1, 512, 2048, 2048, 2048, 512] + - [23, 19010.0] + - - [2048, 3377, 1, 512, 2048, 2048, 2048, 512] + - [7, 19011.0] + - - [2048, 3378, 1, 512, 2048, 2048, 2048, 512] + - [7, 19058.0] + - - [2048, 3381, 1, 512, 2048, 2048, 2048, 512] + - [7, 19091.0] + - - [2048, 3382, 1, 512, 2048, 2048, 2048, 512] + - [23, 19115.0] + - - [2048, 3383, 1, 512, 2048, 2048, 2048, 512] + - [7, 19107.0] + - - [2048, 3384, 1, 512, 2048, 2048, 2048, 512] + - [7, 19102.0] + - - [2048, 3385, 1, 512, 2048, 2048, 2048, 512] + - [23, 19103.0] + - - [2048, 3386, 1, 512, 2048, 2048, 2048, 512] + - [23, 19136.0] + - - [2048, 3388, 1, 512, 2048, 2048, 2048, 512] + - [7, 19164.0] + - - [2048, 3390, 1, 512, 2048, 2048, 2048, 512] + - [7, 19074.0] + - - [2048, 3391, 1, 512, 2048, 2048, 2048, 512] + - [23, 19172.0] + - - [2048, 3396, 1, 512, 2048, 2048, 2048, 512] + - [23, 19166.0] + - - [2048, 3399, 1, 512, 2048, 2048, 2048, 512] + - [23, 19187.0] + - - [2048, 3402, 1, 512, 2048, 2048, 2048, 512] + - [23, 19255.0] + - - [2048, 3410, 1, 512, 2048, 2048, 2048, 512] + - [23, 19238.0] + - - [2048, 3412, 1, 512, 2048, 2048, 2048, 512] + - [23, 19252.0] + - - [2048, 3414, 1, 512, 2048, 2048, 2048, 512] + - [23, 19220.0] + - - [2048, 3415, 1, 512, 2048, 2048, 2048, 512] + - [23, 19300.0] + - - [2048, 3418, 1, 512, 2048, 2048, 2048, 512] + - [23, 19250.0] + - - [2048, 3420, 1, 512, 2048, 2048, 2048, 512] + - [7, 19291.0] + - - [2048, 3422, 1, 512, 2048, 2048, 2048, 512] + - [7, 19269.0] + - - [2048, 3425, 1, 512, 2048, 2048, 2048, 512] + - [7, 19236.0] + - - [2048, 3426, 1, 512, 2048, 2048, 2048, 512] + - [23, 19389.0] + - - [2048, 3427, 1, 512, 2048, 2048, 2048, 512] + - [23, 19320.0] + - - [2048, 3428, 1, 512, 2048, 2048, 2048, 512] + - [7, 19286.0] + - - [2048, 3430, 1, 512, 2048, 2048, 2048, 512] + - [23, 19343.0] + - - [2048, 3431, 1, 512, 2048, 2048, 2048, 512] + - [23, 19378.0] + - - [2048, 3432, 1, 512, 2048, 2048, 2048, 512] + - [7, 19335.0] + - - [2048, 3433, 1, 512, 2048, 2048, 2048, 512] + - [23, 19435.0] + - - [2048, 3438, 1, 512, 2048, 2048, 2048, 512] + - [7, 19388.0] + - - [2048, 3439, 1, 512, 2048, 2048, 2048, 512] + - [7, 19367.0] + - - [2048, 3440, 1, 512, 2048, 2048, 2048, 512] + - [23, 19358.0] + - - [2048, 3443, 1, 512, 2048, 2048, 2048, 512] + - [23, 19443.0] + - - [2048, 3445, 1, 512, 2048, 2048, 2048, 512] + - [23, 19413.0] + - - [2048, 3447, 1, 512, 2048, 2048, 2048, 512] + - [23, 19418.0] + - - [2048, 3448, 1, 512, 2048, 2048, 2048, 512] + - [23, 19449.0] + - - [2048, 3450, 1, 512, 2048, 2048, 2048, 512] + - [7, 19424.0] + - - [2048, 3451, 1, 512, 2048, 2048, 2048, 512] + - [23, 19476.0] + - - [2048, 3452, 1, 512, 2048, 2048, 2048, 512] + - [7, 19413.0] + - - [2048, 3453, 1, 512, 2048, 2048, 2048, 512] + - [23, 19418.0] + - - [2048, 3455, 1, 512, 2048, 2048, 2048, 512] + - [23, 19486.0] + - - [2048, 3456, 1, 512, 2048, 2048, 2048, 512] + - [7, 19669.0] + - - [2048, 3457, 1, 512, 2048, 2048, 2048, 512] + - [29, 18254.0] + - - [2048, 3458, 1, 512, 2048, 2048, 2048, 512] + - [23, 18322.0] + - - [2048, 3459, 1, 512, 2048, 2048, 2048, 512] + - [29, 18212.0] + - - [2048, 3460, 1, 512, 2048, 2048, 2048, 512] + - [23, 18307.0] + - - [2048, 3461, 1, 512, 2048, 2048, 2048, 512] + - [29, 18255.0] + - - [2048, 3462, 1, 512, 2048, 2048, 2048, 512] + - [23, 18305.0] + - - [2048, 3466, 1, 512, 2048, 2048, 2048, 512] + - [23, 18324.0] + - - [2048, 3467, 1, 512, 2048, 2048, 2048, 512] + - [29, 18290.0] + - - [2048, 3468, 1, 512, 2048, 2048, 2048, 512] + - [23, 18303.0] + - - [2048, 3470, 1, 512, 2048, 2048, 2048, 512] + - [23, 18314.0] + - - [2048, 3471, 1, 512, 2048, 2048, 2048, 512] + - [23, 18330.0] + - - [2048, 3472, 1, 512, 2048, 2048, 2048, 512] + - [29, 18320.0] + - - [2048, 3475, 1, 512, 2048, 2048, 2048, 512] + - [23, 18371.0] + - - [2048, 3476, 1, 512, 2048, 2048, 2048, 512] + - [29, 18332.0] + - - [2048, 3477, 1, 512, 2048, 2048, 2048, 512] + - [23, 18365.0] + - - [2048, 3478, 1, 512, 2048, 2048, 2048, 512] + - [23, 18395.0] + - - [2048, 3479, 1, 512, 2048, 2048, 2048, 512] + - [23, 18396.0] + - - [2048, 3480, 1, 512, 2048, 2048, 2048, 512] + - [23, 18422.0] + - - [2048, 3481, 1, 512, 2048, 2048, 2048, 512] + - [23, 18412.0] + - - [2048, 3483, 1, 512, 2048, 2048, 2048, 512] + - [29, 18397.0] + - - [2048, 3484, 1, 512, 2048, 2048, 2048, 512] + - [23, 18395.0] + - - [2048, 3487, 1, 512, 2048, 2048, 2048, 512] + - [29, 18370.0] + - - [2048, 3489, 1, 512, 2048, 2048, 2048, 512] + - [29, 18395.0] + - - [2048, 3490, 1, 512, 2048, 2048, 2048, 512] + - [23, 18438.0] + - - [2048, 3491, 1, 512, 2048, 2048, 2048, 512] + - [23, 18393.0] + - - [2048, 3493, 1, 512, 2048, 2048, 2048, 512] + - [23, 18461.0] + - - [2048, 3494, 1, 512, 2048, 2048, 2048, 512] + - [29, 18444.0] + - - [2048, 3495, 1, 512, 2048, 2048, 2048, 512] + - [23, 18457.0] + - - [2048, 3497, 1, 512, 2048, 2048, 2048, 512] + - [23, 18462.0] + - - [2048, 3498, 1, 512, 2048, 2048, 2048, 512] + - [29, 18461.0] + - - [2048, 3501, 1, 512, 2048, 2048, 2048, 512] + - [29, 18475.0] + - - [2048, 3503, 1, 512, 2048, 2048, 2048, 512] + - [23, 18506.0] + - - [2048, 3505, 1, 512, 2048, 2048, 2048, 512] + - [23, 18448.0] + - - [2048, 3507, 1, 512, 2048, 2048, 2048, 512] + - [23, 18516.0] + - - [2048, 3508, 1, 512, 2048, 2048, 2048, 512] + - [23, 18505.0] + - - [2048, 3509, 1, 512, 2048, 2048, 2048, 512] + - [23, 18510.0] + - - [2048, 3510, 1, 512, 2048, 2048, 2048, 512] + - [23, 18530.0] + - - [2048, 3511, 1, 512, 2048, 2048, 2048, 512] + - [23, 18528.0] + - - [2048, 3513, 1, 512, 2048, 2048, 2048, 512] + - [23, 18505.0] + - - [2048, 3514, 1, 512, 2048, 2048, 2048, 512] + - [29, 18514.0] + - - [2048, 3515, 1, 512, 2048, 2048, 2048, 512] + - [23, 18581.0] + - - [2048, 3517, 1, 512, 2048, 2048, 2048, 512] + - [23, 18590.0] + - - [2048, 3518, 1, 512, 2048, 2048, 2048, 512] + - [23, 18541.0] + - - [2048, 3519, 1, 512, 2048, 2048, 2048, 512] + - [23, 18606.0] + - - [2048, 3520, 1, 512, 2048, 2048, 2048, 512] + - [29, 18579.0] + - - [2048, 3523, 1, 512, 2048, 2048, 2048, 512] + - [23, 18612.0] + - - [2048, 3528, 1, 512, 2048, 2048, 2048, 512] + - [23, 18629.0] + - - [2048, 3529, 1, 512, 2048, 2048, 2048, 512] + - [23, 18621.0] + - - [2048, 3530, 1, 512, 2048, 2048, 2048, 512] + - [23, 18660.0] + - - [2048, 3531, 1, 512, 2048, 2048, 2048, 512] + - [29, 18594.0] + - - [2048, 3532, 1, 512, 2048, 2048, 2048, 512] + - [29, 18645.0] + - - [2048, 3533, 1, 512, 2048, 2048, 2048, 512] + - [23, 18687.0] + - - [2048, 3534, 1, 512, 2048, 2048, 2048, 512] + - [23, 18644.0] + - - [2048, 3538, 1, 512, 2048, 2048, 2048, 512] + - [23, 18697.0] + - - [2048, 3539, 1, 512, 2048, 2048, 2048, 512] + - [23, 18651.0] + - - [2048, 3540, 1, 512, 2048, 2048, 2048, 512] + - [7, 18644.0] + - - [2048, 3541, 1, 512, 2048, 2048, 2048, 512] + - [29, 18686.0] + - - [2048, 3547, 1, 512, 2048, 2048, 2048, 512] + - [29, 18739.0] + - - [2048, 3548, 1, 512, 2048, 2048, 2048, 512] + - [23, 18731.0] + - - [2048, 3552, 1, 512, 2048, 2048, 2048, 512] + - [29, 18728.0] + - - [2048, 3564, 1, 512, 2048, 2048, 2048, 512] + - [29, 18763.0] + - - [2048, 3575, 1, 512, 2048, 2048, 2048, 512] + - [29, 18805.0] + - - [2048, 3598, 1, 512, 2048, 2048, 2048, 512] + - [23, 18836.0] + - - [2048, 3599, 1, 512, 2048, 2048, 2048, 512] + - [23, 18858.0] + - - [2048, 3608, 1, 512, 2048, 2048, 2048, 512] + - [7, 18888.0] + - - [2048, 3776, 1, 512, 2048, 2048, 2048, 512] + - [29, 18556.0] + - - [2048, 3780, 1, 512, 2048, 2048, 2048, 512] + - [7, 18565.0] + - - [2048, 3796, 1, 512, 2048, 2048, 2048, 512] + - [23, 18584.0] + - - [2048, 3822, 1, 512, 2048, 2048, 2048, 512] + - [23, 18764.0] + - - [2048, 3835, 1, 512, 2048, 2048, 2048, 512] + - [7, 18751.0] + - - [2048, 3840, 1, 512, 2048, 2048, 2048, 512] + - [7, 18986.0] + - - [2048, 3859, 1, 512, 2048, 2048, 2048, 512] + - [23, 18822.0] + - - [2048, 3864, 1, 512, 2048, 2048, 2048, 512] + - [23, 18919.0] + - - [2048, 3870, 1, 512, 2048, 2048, 2048, 512] + - [7, 18885.0] + - - [2048, 3876, 1, 512, 2048, 2048, 2048, 512] + - [23, 18946.0] + - - [2048, 3906, 1, 512, 2048, 2048, 2048, 512] + - [7, 19073.0] + - - [2048, 3910, 1, 512, 2048, 2048, 2048, 512] + - [23, 19121.0] + - - [2048, 3925, 1, 512, 2048, 2048, 2048, 512] + - [7, 19112.0] + - - [2048, 3942, 1, 512, 2048, 2048, 2048, 512] + - [23, 19208.0] + - - [2048, 3944, 1, 512, 2048, 2048, 2048, 512] + - [23, 19264.0] + - - [2048, 3955, 1, 512, 2048, 2048, 2048, 512] + - [23, 19307.0] + - - [2048, 3968, 1, 512, 2048, 2048, 2048, 512] + - [23, 19470.0] + - - [2048, 3969, 1, 512, 2048, 2048, 2048, 512] + - [29, 18669.0] + - - [2048, 3976, 1, 512, 2048, 2048, 2048, 512] + - [29, 18723.0] + - - [2048, 3977, 1, 512, 2048, 2048, 2048, 512] + - [29, 18734.0] + - - [2048, 3978, 1, 512, 2048, 2048, 2048, 512] + - [29, 18722.0] + - - [2048, 3990, 1, 512, 2048, 2048, 2048, 512] + - [22, 18718.0] + - - [2048, 3995, 1, 512, 2048, 2048, 2048, 512] + - [29, 18817.0] + - - [2048, 3996, 1, 512, 2048, 2048, 2048, 512] + - [29, 18825.0] + - - [2048, 3999, 1, 512, 2048, 2048, 2048, 512] + - [29, 18811.0] + - - [2048, 4005, 1, 512, 2048, 2048, 2048, 512] + - [29, 18849.0] + - - [2048, 4012, 1, 512, 2048, 2048, 2048, 512] + - [29, 18892.0] + - - [2048, 4020, 1, 512, 2048, 2048, 2048, 512] + - [29, 18906.0] + - - [2048, 4026, 1, 512, 2048, 2048, 2048, 512] + - [29, 18932.0] + - - [2048, 4030, 1, 512, 2048, 2048, 2048, 512] + - [29, 18916.0] + - - [2048, 4032, 1, 512, 2048, 2048, 2048, 512] + - [29, 19034.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 18090.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18600.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19019.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19158.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19659.0] + - - [1024, 3968, 1, 42720, 1024, 1024, 1024, 42720] + - [30, 19493.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19097.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19381.0] + - - [1024, 7200, 1, 42720, 1024, 1024, 1024, 42720] + - [15, 19562.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 18954.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19279.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19544.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 19639.0] + - - [1024, 9520, 1, 42720, 1024, 1024, 1024, 42720] + - [30, 19931.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19782.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19920.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20161.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20162.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20010.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20207.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20145.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20171.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 17542.0] + - - [1024, 2048, 1, 30528, 1024, 1024, 1024, 30528] + - [21, 17820.0] + - - [1024, 4096, 1, 30528, 1024, 1024, 1024, 30528] + - [14, 18172.0] + - - [1024, 10240, 1, 256, 1024, 1024, 1024, 256] + - [7, 19092.0] + - - [1024, 10496, 1, 256, 1024, 1024, 1024, 256] + - [7, 19007.0] + - - [1024, 11008, 1, 256, 1024, 1024, 1024, 256] + - [7, 18997.0] + - - [1024, 11264, 1, 256, 1024, 1024, 1024, 256] + - [7, 19319.0] + - - [1024, 11520, 1, 256, 1024, 1024, 1024, 256] + - [4, 19551.0] + - - [1024, 12288, 1, 256, 1024, 1024, 1024, 256] + - [7, 19277.0] + - - [1024, 13312, 1, 256, 1024, 1024, 1024, 256] + - [7, 19248.0] + - - [1024, 13568, 1, 256, 1024, 1024, 1024, 256] + - [7, 19436.0] + - - [1024, 14336, 1, 256, 1024, 1024, 1024, 256] + - [4, 19701.0] + - - [1024, 14592, 1, 256, 1024, 1024, 1024, 256] + - [7, 19441.0] + - - [1024, 14848, 1, 256, 1024, 1024, 1024, 256] + - [7, 19642.0] + - - [1024, 15104, 1, 256, 1024, 1024, 1024, 256] + - [7, 19445.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 16838.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1] + - [6, 392.0] + - - [1024, 16128, 1, 256, 1024, 1024, 1024, 256] + - [7, 19729.0] + - - [1024, 17152, 1, 256, 1024, 1024, 1024, 256] + - [7, 19748.0] + - - [1024, 1792, 1, 256, 1024, 1024, 1024, 256] + - [29, 15669.0] + - - [1024, 18944, 1, 256, 1024, 1024, 1024, 256] + - [4, 19855.0] + - - [1024, 19712, 1, 256, 1024, 1024, 1024, 256] + - [7, 19670.0] + - - [1024, 19968, 1, 256, 1024, 1024, 1024, 256] + - [4, 19795.0] + - - [1024, 20480, 1, 256, 1024, 1024, 1024, 256] + - [4, 19757.0] + - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] + - [29, 15903.0] + - - [1024, 20992, 1, 256, 1024, 1024, 1024, 256] + - [7, 19776.0] + - - [1024, 21504, 1, 256, 1024, 1024, 1024, 256] + - [7, 19732.0] + - - [1024, 22016, 1, 256, 1024, 1024, 1024, 256] + - [7, 19760.0] + - - [1024, 23552, 1, 256, 1024, 1024, 1024, 256] + - [11, 19946.0] + - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] + - [29, 17467.0] + - - [1024, 28672, 1, 256, 1024, 1024, 1024, 256] + - [4, 19671.0] + - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] + - [29, 17476.0] + - - [1024, 3328, 1, 256, 1024, 1024, 1024, 256] + - [29, 17617.0] + - - [1024, 33536, 1, 256, 1024, 1024, 1024, 256] + - [19, 19464.0] + - - [1024, 3840, 1, 256, 1024, 1024, 1024, 256] + - [29, 17410.0] + - - [1024, 40448, 1, 256, 1024, 1024, 1024, 256] + - [19, 19541.0] + - - [1024, 4096, 1, 256, 1024, 1024, 1024, 256] + - [29, 17482.0] + - - [1024, 4608, 1, 256, 1024, 1024, 1024, 256] + - [4, 18527.0] + - - [1024, 4864, 1, 256, 1024, 1024, 1024, 256] + - [29, 18405.0] + - - [1024, 5120, 1, 256, 1024, 1024, 1024, 256] + - [4, 18502.0] + - - [1024, 5632, 1, 256, 1024, 1024, 1024, 256] + - [7, 18580.0] + - - [1024, 6144, 1, 256, 1024, 1024, 1024, 256] + - [7, 18547.0] + - - [1024, 6400, 1, 256, 1024, 1024, 1024, 256] + - [29, 18169.0] + - - [1024, 7168, 1, 256, 1024, 1024, 1024, 256] + - [29, 18690.0] + - - [1024, 7424, 1, 256, 1024, 1024, 1024, 256] + - [4, 19084.0] + - - [1024, 7680, 1, 256, 1024, 1024, 1024, 256] + - [29, 18624.0] + - - [1024, 7936, 1, 256, 1024, 1024, 1024, 256] + - [11, 19013.0] + - - [1024, 8192, 1, 256, 1024, 1024, 1024, 256] + - [7, 18619.0] + - - [1024, 8448, 1, 256, 1024, 1024, 1024, 256] + - [4, 19000.0] + - - [1024, 8704, 1, 256, 1024, 1024, 1024, 256] + - [7, 18635.0] + - - [1024, 8960, 1, 256, 1024, 1024, 1024, 256] + - [7, 18985.0] + - - [1024, 9728, 1, 256, 1024, 1024, 1024, 256] + - [7, 19375.0] + - - [1024, 9984, 1, 256, 1024, 1024, 1024, 256] + - [4, 18900.0] + - - [2048, 1024, 1, 1, 2048, 2048, 2048, 1] + - [6, 408.0] + - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] + - [14, 16016.0] + - - [256, 8976, 1, 10240, 256, 256, 256, 10240] + - [27, 18046.0] + - - [256, 8976, 1, 10496, 256, 256, 256, 10496] + - [13, 19236.0] + - - [256, 8976, 1, 11008, 256, 256, 256, 11008] + - [13, 19254.0] + - - [256, 8976, 1, 11520, 256, 256, 256, 11520] + - [13, 19261.0] + - - [256, 8976, 1, 12288, 256, 256, 256, 12288] + - [27, 18038.0] + - - [256, 8976, 1, 14336, 256, 256, 256, 14336] + - [27, 18044.0] + - - [256, 8976, 1, 14848, 256, 256, 256, 14848] + - [13, 19305.0] + - - [256, 8976, 1, 15104, 256, 256, 256, 15104] + - [28, 19311.0] + - - [256, 8976, 1, 1536, 256, 256, 256, 1536] + - [25, 18259.0] + - - [256, 8976, 1, 15872, 256, 256, 256, 15872] + - [13, 19322.0] + - - [256, 8976, 1, 17152, 256, 256, 256, 17152] + - [13, 19325.0] + - - [256, 8976, 1, 19712, 256, 256, 256, 19712] + - [40, 19350.0] + - - [256, 8976, 1, 19968, 256, 256, 256, 19968] + - [13, 19334.0] + - - [256, 8976, 1, 20480, 256, 256, 256, 20480] + - [12, 17652.0] + - - [256, 8976, 1, 2048, 256, 256, 256, 2048] + - [9, 18484.0] + - - [256, 8976, 1, 20992, 256, 256, 256, 20992] + - [13, 19351.0] + - - [256, 8976, 1, 22016, 256, 256, 256, 22016] + - [13, 19355.0] + - - [256, 8976, 1, 2304, 256, 256, 256, 2304] + - [9, 18531.0] + - - [256, 8976, 1, 2560, 256, 256, 256, 2560] + - [40, 18892.0] + - - [256, 8976, 1, 26112, 256, 256, 256, 26112] + - [13, 19366.0] + - - [256, 8976, 1, 2816, 256, 256, 256, 2816] + - [34, 18683.0] + - - [256, 8976, 1, 3072, 256, 256, 256, 3072] + - [40, 18993.0] + - - [256, 8976, 1, 33536, 256, 256, 256, 33536] + - [13, 19347.0] + - - [256, 8976, 1, 4352, 256, 256, 256, 4352] + - [13, 18910.0] + - - [256, 8976, 1, 44505, 256, 256, 256, 44505] + - [21, 19427.0] + - - [256, 8976, 1, 4864, 256, 256, 256, 4864] + - [13, 18987.0] + - - [256, 8976, 1, 5376, 256, 256, 256, 5376] + - [13, 19044.0] + - - [256, 8976, 1, 5632, 256, 256, 256, 5632] + - [40, 19060.0] + - - [256, 8976, 1, 5888, 256, 256, 256, 5888] + - [28, 19081.0] + - - [256, 8976, 1, 6144, 256, 256, 256, 6144] + - [11, 18215.0] + - - [256, 8976, 1, 6656, 256, 256, 256, 6656] + - [13, 19121.0] + - - [256, 8976, 1, 7168, 256, 256, 256, 7168] + - [27, 18132.0] + - - [256, 8976, 1, 7424, 256, 256, 256, 7424] + - [40, 19160.0] + - - [256, 8976, 1, 8192, 256, 256, 256, 8192] + - [39, 17408.0] + - - [256, 8976, 1, 8448, 256, 256, 256, 8448] + - [28, 19171.0] + - - [256, 8976, 1, 8960, 256, 256, 256, 8960] + - [40, 19200.0] + - - [256, 8976, 1, 9472, 256, 256, 256, 9472] + - [13, 19217.0] + - - [256, 8976, 1, 9728, 256, 256, 256, 9728] + - [40, 19231.0] + - - [256, 8976, 1, 9984, 256, 256, 256, 9984] + - [28, 19235.0] + - - [3200, 1024, 1, 2048, 3200, 3200, 3200, 2048] + - [23, 18260.0] + - - [4096, 1024, 1, 1, 4096, 4096, 4096, 1] + - [6, 484.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18107.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [35, 18292.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] + - [28, 17559.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 30528, 1024] + - [36, 20675.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 30528, 1024] + - [36, 20521.0] + - - [512, 32768, 1, 256, 512, 512, 512, 256] + - [36, 19528.0] + - - [256, 32768, 1, 128, 256, 256, 256, 128] + - [29, 18181.0] + - - [1024, 32768, 1, 512, 1024, 1024, 1024, 512] + - [11, 19908.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [27, 20218.0] + - - [479, 32768, 1, 1024, 479, 479, 479, 1024] + - [39, 18349.0] + - - [289, 128, 64, 768, 289, 289, 289, 768] + - [35, 13421.0] + - - [289, 160, 64, 768, 289, 289, 289, 768] + - [22, 11766.0] + - - [289, 192, 64, 768, 289, 289, 289, 768] + - [35, 14092.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 13797.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [29, 16371.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [39, 16102.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [7, 14293.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [7, 13183.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 64] + - [29, 18140.0] + - - [784, 512, 32, 128, 784, 784, 784, 128] + - [2, 16164.0] + - - [784, 128, 32, 512, 784, 784, 784, 512] + - [35, 15470.0] + - - [196, 1024, 32, 256, 196, 196, 196, 256] + - [35, 14041.0] + - - [256, 6912, 1, 4, 256, 256, 256, 4] + - [6, 1539.0] + - - [512, 4096, 1, 256, 512, 512, 512, 256] + - [36, 15818.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 512] + - [22, 17751.0] + - - [480, 4096, 1, 1024, 480, 480, 480, 1024] + - [23, 15763.0] + - - [512, 6912, 1, 256, 512, 512, 512, 256] + - [11, 18148.0] + - - [1024, 6912, 1, 512, 1024, 1024, 1024, 512] + - [23, 19652.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19909.0] + - - [480, 6912, 1, 1024, 480, 480, 480, 1024] + - [23, 18133.0] + - - [256, 55296, 1, 128, 256, 256, 256, 128] + - [30, 19021.0] + - - [512, 55296, 1, 256, 512, 512, 512, 256] + - [1, 19536.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [7, 18759.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 2880, 3072] + - [36, 19231.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [7, 20328.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [4, 20550.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [22, 18020.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [7, 20423.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [7, 20251.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [4, 20566.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [29, 16059.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [21, 19375.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [14, 18210.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [16, 19240.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [30, 19268.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [1, 19895.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [23, 19853.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [30, 20299.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [1, 20690.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [15, 20495.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [15, 20416.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [15, 20591.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [23, 20527.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [15, 20673.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [15, 20615.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [30, 20723.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [21, 20633.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 384] + - [29, 15221.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 384] + - [1, 18288.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 384] + - [29, 17887.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 384] + - [16, 18808.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 384] + - [1, 18856.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 384] + - [1, 19991.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 384] + - [30, 19590.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 384] + - [1, 19587.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 384] + - [15, 19957.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 384] + - [30, 20329.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 384] + - [15, 20142.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 384] + - [1, 20098.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 384] + - [15, 20209.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 384] + - [1, 20060.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 384] + - [1, 20248.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 384] + - [1, 20186.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 384] + - [1, 20286.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 384] + - [1, 20368.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 384] + - [1, 20415.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 384] + - [1, 20454.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 384] + - [15, 20490.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 384] + - [27, 20180.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 384] + - [1, 20518.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 384] + - [1, 20523.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 384] + - [1, 20564.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 384] + - [1, 20549.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 384] + - [1, 20626.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 384] + - [7, 20600.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 384] + - [1, 20634.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 384] + - [1, 20493.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 384] + - [1, 20631.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 384] + - [1, 20626.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 384] + - [15, 20657.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 384] + - [23, 20536.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 384] + - [7, 20694.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 384] + - [1, 20645.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 384] + - [1, 20678.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 384] + - [1, 20608.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 384] + - [1, 20720.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 384] + - [1, 20673.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 384] + - [15, 20721.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 384] + - [1, 20696.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 384] + - [1, 20727.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 384] + - [1, 20714.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 384] + - [15, 20746.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 384] + - [7, 20571.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 384] + - [15, 20749.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 384] + - [1, 20716.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 384] + - [1, 20734.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 384] + - [1, 20710.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 384] + - [15, 20760.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 384] + - [1, 20730.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 384] + - [15, 20771.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 384] + - [7, 20687.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 384] + - [1, 20737.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 384] + - [15, 20743.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 384] + - [15, 20779.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 384] + - [23, 20673.0] + - - [8192, 1024, 1, 1024, 8192, 8192, 8192, 1024] + - [7, 19167.0] + - - [8192, 4096, 1, 1024, 8192, 8192, 8192, 1024] + - [1, 20368.0] + - - [16384, 16384, 1, 16384, 16384, 16384, 16384, 16384] + - [21, 19166.0] + - - [1444, 256, 120, 128, 1444, 1444, 1444, 128] + - [0, 15820.0] + - - [1444, 256, 139, 128, 1444, 1444, 1444, 128] + - [3, 13333.0] + - - [1444, 256, 160, 128, 1444, 1444, 1444, 128] + - [3, 13344.0] + - - [1444, 256, 18, 128, 1444, 1444, 1444, 128] + - [6, 16596.0] + - - [1444, 256, 19, 128, 1444, 1444, 1444, 128] + - [35, 16829.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [4, 17858.0] + - - [1444, 256, 139, 256, 1444, 1444, 1444, 256] + - [4, 17920.0] + - - [1444, 256, 160, 256, 1444, 1444, 1444, 256] + - [4, 17836.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [31, 17590.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [7, 17562.0] + - - [361, 256, 120, 512, 361, 361, 361, 512] + - [40, 16216.0] + - - [361, 256, 139, 512, 361, 361, 361, 512] + - [12, 15850.0] + - - [361, 256, 160, 512, 361, 361, 361, 512] + - [11, 15865.0] + - - [361, 256, 18, 512, 361, 361, 361, 512] + - [35, 15991.0] + - - [361, 256, 19, 512, 361, 361, 361, 512] + - [35, 15099.0] + - - [173280, 128, 1, 64, 173280, 173280, 173280, 64] + - [35, 17928.0] + - - [200716, 128, 1, 64, 200716, 200716, 200716, 64] + - [0, 14623.0] + - - [231040, 128, 1, 64, 231040, 231040, 231040, 64] + - [7, 10202.0] + - - [25992, 128, 1, 64, 25992, 25992, 25992, 64] + - [6, 12569.0] + - - [27436, 128, 1, 64, 27436, 27436, 27436, 64] + - [16, 14261.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [4, 20550.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [7, 20260.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 18838.0] + - - [1024, 1280, 1, 2, 1024, 1024, 1024, 2] + - [6, 690.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 16128.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19558.0] + - - [1024, 4992, 1, 2, 1024, 1024, 1024, 2] + - [0, 1083.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19400.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20341.0] + - - [1024, 5120, 1, 2, 1024, 1024, 1024, 2] + - [6, 1034.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19412.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 19626.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20240.0] + - - [1024, 5248, 1, 2, 1024, 1024, 1024, 2] + - [6, 1031.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 18218.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 18404.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20227.0] + - - [1024, 2560, 1, 2, 1024, 1024, 1024, 2] + - [6, 868.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 18377.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20003.0] + - - [1024, 1152, 1, 2, 1024, 1024, 1024, 2] + - [35, 678.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 1024, 4096] + - [24, 18214.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 19658.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19204.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19343.0] + - - [1024, 8192, 1, 33712, 1024, 1024, 1024, 33712] + - [1, 19442.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19795.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 19787.0] + - - [1024, 9600, 1, 33712, 1024, 1024, 1024, 33712] + - [30, 20102.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20370.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20299.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19567.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19748.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19722.0] + - - [1024, 10080, 1, 42720, 1024, 1024, 1024, 42720] + - [21, 19995.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 18842.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19113.0] + - - [1024, 6528, 1, 42720, 1024, 1024, 1024, 42720] + - [21, 19357.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 18965.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19300.0] + - - [1024, 7104, 1, 42720, 1024, 1024, 1024, 42720] + - [15, 19319.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19982.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 20060.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 20110.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 20218.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20208.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20150.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20097.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20048.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20355.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 20289.0] + - - [480, 32768, 1, 1024, 480, 480, 480, 1024] + - [27, 18445.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [22, 17574.0] + - - [2048, 1024, 1, 30592, 2048, 2048, 2048, 30592] + - [34, 17825.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 2048, 6144] + - [28, 17626.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 2048, 8192] + - [13, 17708.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 8192, 2048] + - [7, 19333.0] + - - [1024, 8192, 1, 30592, 1024, 1024, 1024, 30592] + - [7, 19195.0] + - - [1024, 8192, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 19394.0] + - - [512, 512, 256, 64, 512, 512, 512, 64] + - [0, 12670.0] + - - [1024, 2048, 1, 30592, 1024, 1024, 1024, 30592] + - [21, 17816.0] + - - [1024, 4096, 1, 30592, 1024, 1024, 1024, 30592] + - [5, 18128.0] + - - [512, 512, 128, 64, 512, 512, 512, 64] + - [0, 17369.0] + - - [2560, 2048, 1, 1920, 2560, 2560, 2560, 1920] + - [30, 19858.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [23, 19866.0] + - - [2560, 2048, 1, 7680, 2560, 2560, 2560, 7680] + - [36, 19646.0] + - - [640, 2048, 1, 2560, 640, 640, 640, 2560] + - [35, 16003.0] + - - [512, 512, 40, 64, 512, 512, 512, 64] + - [0, 17982.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [36, 19553.0] + - - [1536, 4096, 1, 4608, 1536, 1536, 1536, 4608] + - [23, 19681.0] + - - [1536, 4096, 1, 50304, 1536, 1536, 1536, 50304] + - [4, 19437.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 1536, 6144] + - [23, 19635.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 20487.0] + - - [1024, 1024, 64, 96, 1024, 1024, 1024, 96] + - [0, 15461.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [36, 19871.0] + - - [1536, 8192, 1, 4608, 1536, 1536, 1536, 4608] + - [36, 19939.0] + - - [1536, 8192, 1, 50304, 1536, 1536, 1536, 50304] + - [34, 19975.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 1536, 6144] + - [40, 19902.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 20652.0] + - - [1024, 1024, 128, 96, 1024, 1024, 1024, 96] + - [0, 15198.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 20084.0] + - - [1024, 16384, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 20101.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 20092.0] + - - [1024, 16384, 1, 50304, 1024, 1024, 1024, 50304] + - [4, 20068.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20602.0] + - - [1024, 1024, 256, 64, 1024, 1024, 1024, 64] + - [10, 11016.0] + - - [1024, 2048, 1, 50304, 1024, 1024, 1024, 50304] + - [34, 17818.0] + - - [1024, 1024, 32, 64, 1024, 1024, 1024, 64] + - [0, 17977.0] + - - [1024, 4096, 1, 50304, 1024, 1024, 1024, 50304] + - [21, 18133.0] + - - [1024, 1024, 64, 64, 1024, 1024, 1024, 64] + - [0, 14980.0] + - - [1024, 8192, 1, 50304, 1024, 1024, 1024, 50304] + - [33, 19151.0] + - - [1024, 1024, 128, 64, 1024, 1024, 1024, 64] + - [0, 12336.0] + - - [128, 128, 1024, 64, 128, 128, 128, 64] + - [20, 16229.0] + - - [1024, 8192, 1, 30528, 1024, 1024, 1024, 30528] + - [15, 19385.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19125.0] + - - [1024, 3456, 1, 512, 1024, 1024, 1024, 512] + - [7, 18827.0] + - - [256, 6912, 1, 128, 256, 256, 256, 128] + - [0, 15685.0] + - - [480, 3456, 1, 1024, 480, 480, 480, 1024] + - [24, 16822.0] + - - [512, 3456, 1, 256, 512, 512, 512, 256] + - [35, 16902.0] + - - [1024, 1280, 1, 30528, 1024, 1024, 1024, 30528] + - [14, 16316.0] + - - [1024, 1600, 1, 30528, 1024, 1024, 1024, 30528] + - [2, 17538.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19932.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 19922.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20308.0] + - - [128, 128, 1280, 64, 128, 128, 128, 64] + - [23, 7233.0] + - - [1024, 1640, 1, 30528, 1024, 1024, 1024, 30528] + - [37, 17952.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19508.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19598.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20441.0] + - - [128, 128, 1312, 64, 128, 128, 128, 64] + - [36, 6844.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19552.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20277.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19289.0] + - - [512, 512, 192, 64, 512, 512, 512, 64] + - [0, 13616.0] + - - [256, 6912, 1, 1, 256, 256, 256, 1] + - [6, 375.0] + - - [3136, 128, 64, 64, 3136, 3136, 3136, 64] + - [36, 11021.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 128] + - [18, 14802.0] + - - [784, 512, 64, 256, 784, 784, 784, 256] + - [4, 16778.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 256] + - [32, 16560.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [19, 19247.0] + - - [196, 1024, 64, 512, 196, 196, 196, 512] + - [27, 14455.0] + - - [784, 256, 64, 512, 784, 784, 784, 512] + - [4, 16670.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [4, 17215.0] + - - [196, 512, 64, 1024, 196, 196, 196, 1024] + - [27, 13620.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [27, 14323.0] + - - [3136, 128, 32, 64, 3136, 3136, 3136, 64] + - [14, 17307.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 128] + - [0, 18549.0] + - - [784, 512, 32, 256, 784, 784, 784, 256] + - [36, 16860.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 256] + - [4, 17879.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [19, 18779.0] + - - [196, 1024, 32, 512, 196, 196, 196, 512] + - [36, 14291.0] + - - [784, 256, 32, 512, 784, 784, 784, 512] + - [23, 16460.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [36, 17447.0] + - - [196, 512, 32, 1024, 196, 196, 196, 1024] + - [35, 13615.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [27, 14138.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19825.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19873.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19897.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19932.0] + - - [1024, 10224, 1, 3072, 1024, 1024, 1024, 3072] + - [28, 19874.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20211.0] + - - [1024, 10240, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 19931.0] + - - [1024, 10192, 1, 3072, 1024, 1024, 1024, 3072] + - [28, 19857.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20246.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19899.0] + - - [1024, 10200, 1, 3072, 1024, 1024, 1024, 3072] + - [28, 19862.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19736.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20219.0] + - - [1024, 10208, 1, 3072, 1024, 1024, 1024, 3072] + - [28, 19845.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 19913.0] + - - [1024, 10224, 1, 2048, 1024, 1024, 1024, 2048] + - [7, 20014.0] + - - [1024, 10240, 1, 2048, 1024, 1024, 1024, 2048] + - [7, 20025.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19607.0] + - - [1024, 10192, 1, 2048, 1024, 1024, 1024, 2048] + - [7, 19967.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19680.0] + - - [1024, 10080, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 19797.0] + - - [100352, 512, 1, 256, 100352, 100352, 100352, 256] + - [27, 19862.0] + - - [12544, 2048, 1, 1024, 12544, 12544, 12544, 1024] + - [36, 20355.0] + - - [200704, 512, 1, 256, 200704, 200704, 200704, 256] + - [27, 20102.0] + - - [25088, 1024, 1, 512, 25088, 25088, 25088, 512] + - [1, 20228.0] + - - [50176, 1024, 1, 512, 50176, 50176, 50176, 512] + - [7, 20215.0] + - - [6272, 2048, 1, 1024, 6272, 6272, 6272, 1024] + - [7, 20148.0] + - - [196, 1024, 128, 256, 196, 196, 196, 256] + - [10, 13615.0] + - - [196, 1024, 256, 256, 196, 196, 196, 256] + - [10, 13736.0] + - - [196, 256, 128, 1024, 196, 196, 196, 1024] + - [38, 12752.0] + - - [196, 256, 256, 1024, 196, 196, 196, 1024] + - [26, 13274.0] + - - [196, 512, 128, 1024, 196, 196, 196, 1024] + - [39, 14008.0] + - - [196, 512, 256, 1024, 196, 196, 196, 1024] + - [39, 14249.0] + - - [3136, 128, 128, 256, 3136, 3136, 3136, 256] + - [32, 16249.0] + - - [3136, 128, 256, 256, 3136, 3136, 3136, 256] + - [32, 16034.0] + - - [784, 256, 128, 512, 784, 784, 784, 512] + - [11, 17065.0] + - - [784, 256, 256, 512, 784, 784, 784, 512] + - [11, 17443.0] + - - [128, 128, 2048, 64, 128, 128, 128, 64] + - [0, 6907.0] + - - [1024, 2560, 1, 30528, 1024, 1024, 1024, 30528] + - [14, 18287.0] + - - [128, 128, 1536, 64, 128, 128, 128, 64] + - [0, 8019.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 19853.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 19703.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20497.0] + - - [1024, 1920, 1, 30528, 1024, 1024, 1024, 30528] + - [14, 17769.0] + - - [128, 128, 192, 64, 128, 128, 128, 64] + - [14, 12599.0] + - - [384, 384, 144, 64, 384, 384, 384, 64] + - [0, 18570.0] + - - [768, 4608, 1, 2, 768, 768, 768, 2] + - [29, 900.0] + - - [3072, 4608, 1, 768, 3072, 3072, 3072, 768] + - [1, 20298.0] + - - [768, 4608, 1, 3072, 768, 768, 768, 3072] + - [40, 19831.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [36, 19284.0] + - - [512, 512, 48, 64, 512, 512, 512, 64] + - [29, 18187.0] + - - [128, 128, 256, 64, 128, 128, 128, 64] + - [0, 14340.0] + - - [384, 384, 192, 64, 384, 384, 384, 64] + - [0, 17919.0] + - - [1024, 4608, 1, 2, 1024, 1024, 1024, 2] + - [6, 989.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 4096, 1024] + - [7, 20435.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 20046.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 19508.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 1024] + - [31, 16295.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 1024] + - [24, 15040.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 1024] + - [8, 16858.0] + - - [850, 2048, 2, 512, 850, 850, 850, 512] + - [35, 16766.0] + - - [768, 2048, 2, 512, 768, 768, 768, 512] + - [35, 17876.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 1024] + - [8, 15413.0] + - - [805, 2048, 2, 512, 805, 805, 805, 512] + - [35, 15846.0] + - - [864, 2048, 2, 512, 864, 864, 864, 512] + - [22, 17046.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 1024] + - [7, 15861.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 1024] + - [8, 17905.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 1024] + - [5, 16464.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 512] + - [7, 17846.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 512] + - [29, 17943.0] + - - [888, 2048, 2, 512, 888, 888, 888, 512] + - [22, 17483.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 512] + - [23, 18320.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 512] + - [1, 17401.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 1024] + - [16, 18410.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 1024] + - [23, 15689.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 1024] + - [22, 16514.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 512] + - [1, 19166.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 1024] + - [8, 16085.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 1024] + - [8, 16686.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 1024] + - [8, 17762.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 1024] + - [6, 16054.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 512] + - [14, 16816.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 1024] + - [7, 17235.0] + - - [840, 2048, 2, 512, 840, 840, 840, 512] + - [35, 16578.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 512] + - [29, 17749.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 512] + - [29, 17197.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 1024] + - [8, 17123.0] + - - [713, 2048, 2, 512, 713, 713, 713, 512] + - [22, 16439.0] + - - [13600, 256, 2, 512, 13600, 13600, 13600, 512] + - [36, 19332.0] + - - [12880, 256, 2, 512, 12880, 12880, 12880, 512] + - [29, 18368.0] + - - [12288, 256, 2, 512, 12288, 12288, 12288, 512] + - [1, 19259.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 1024] + - [6, 17207.0] + - - [850, 2048, 1, 512, 850, 850, 850, 512] + - [6, 14965.0] + - - [660, 2048, 2, 512, 660, 660, 660, 512] + - [22, 15234.0] + - - [672, 2048, 2, 512, 672, 672, 672, 512] + - [22, 15565.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 512] + - [1, 18720.0] + - - [726, 2048, 2, 512, 726, 726, 726, 512] + - [35, 16764.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 1024] + - [6, 15762.0] + - - [13824, 256, 2, 512, 13824, 13824, 13824, 512] + - [1, 19944.0] + - - [15200, 256, 2, 512, 15200, 15200, 15200, 512] + - [1, 18732.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 1024] + - [35, 16540.0] + - - [748, 2048, 2, 512, 748, 748, 748, 512] + - [22, 17227.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 1024] + - [22, 16154.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 256] + - [7, 18363.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 512] + - [31, 18767.0] + - - [15200, 128, 1, 512, 15200, 15200, 15200, 512] + - [0, 16770.0] + - - [13600, 128, 1, 512, 13600, 13600, 13600, 512] + - [2, 17266.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 256] + - [0, 17921.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 256] + - [1, 18338.0] + - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] + - [22, 18058.0] + - - [24576, 128, 1, 256, 24576, 24576, 24576, 256] + - [0, 17568.0] + - - [24576, 512, 1, 256, 24576, 24576, 24576, 256] + - [1, 19273.0] + - - [25760, 128, 1, 256, 25760, 25760, 25760, 256] + - [16, 16525.0] + - - [25760, 512, 1, 256, 25760, 25760, 25760, 256] + - [30, 19254.0] + - - [6144, 256, 1, 512, 6144, 6144, 6144, 512] + - [29, 16010.0] + - - [6440, 256, 1, 512, 6440, 6440, 6440, 512] + - [37, 16519.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 256] + - [7, 18183.0] + - - [13600, 512, 1, 128, 13600, 13600, 13600, 128] + - [7, 18072.0] + - - [9408, 512, 2, 128, 9408, 9408, 9408, 128] + - [0, 18355.0] + - - [56000, 256, 2, 64, 56000, 56000, 56000, 64] + - [0, 18040.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 256] + - [29, 17334.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 256] + - [30, 18788.0] + - - [60800, 256, 1, 64, 60800, 60800, 60800, 64] + - [29, 18454.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 256] + - [14, 18271.0] + - - [11776, 512, 2, 128, 11776, 11776, 11776, 128] + - [29, 18754.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 128] + - [29, 18221.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 256] + - [1, 18756.0] + - - [54400, 256, 1, 64, 54400, 54400, 54400, 64] + - [14, 18234.0] + - - [15200, 512, 1, 128, 15200, 15200, 15200, 128] + - [29, 17687.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 256] + - [29, 18345.0] + - - [12672, 512, 2, 128, 12672, 12672, 12672, 128] + - [30, 19215.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 128] + - [22, 18542.0] + - - [46464, 256, 2, 64, 46464, 46464, 46464, 64] + - [29, 18686.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 1024] + - [35, 14641.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 1024] + - [35, 15406.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 256] + - [14, 18074.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 512] + - [36, 17629.0] + - - [45632, 256, 2, 64, 45632, 45632, 45632, 64] + - [35, 18191.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 256] + - [1, 18401.0] + - - [53760, 256, 2, 64, 53760, 53760, 53760, 64] + - [0, 18266.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 1024] + - [35, 14352.0] + - - [47872, 256, 2, 64, 47872, 47872, 47872, 64] + - [35, 18472.0] + - - [47104, 256, 2, 64, 47104, 47104, 47104, 64] + - [35, 18472.0] + - - [50688, 256, 2, 64, 50688, 50688, 50688, 64] + - [35, 18541.0] + - - [45056, 256, 2, 64, 45056, 45056, 45056, 64] + - [35, 18441.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 128] + - [36, 18926.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 256] + - [6, 17526.0] + - - [11264, 512, 2, 128, 11264, 11264, 11264, 128] + - [1, 18741.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 512] + - [14, 16700.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 128] + - [1, 19200.0] + - - [37632, 256, 2, 64, 37632, 37632, 37632, 64] + - [29, 18399.0] + - - [51520, 256, 2, 64, 51520, 51520, 51520, 64] + - [35, 18204.0] + - - [14000, 512, 2, 128, 14000, 14000, 14000, 128] + - [1, 18461.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 128] + - [6, 18470.0] + - - [64512, 256, 2, 64, 64512, 64512, 64512, 64] + - [0, 17273.0] + - - [54400, 256, 2, 64, 54400, 54400, 54400, 64] + - [14, 18491.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 256] + - [1, 18109.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 128] + - [7, 18686.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 256] + - [1, 17906.0] + - - [950, 2048, 1, 512, 950, 950, 950, 512] + - [7, 15079.0] + - - [55296, 256, 2, 256, 55296, 55296, 55296, 256] + - [39, 19461.0] + - - [51520, 256, 2, 256, 51520, 51520, 51520, 256] + - [33, 19313.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 512] + - [16, 17925.0] + - - [60800, 256, 2, 256, 60800, 60800, 60800, 256] + - [33, 19690.0] + - - [54400, 256, 2, 256, 54400, 54400, 54400, 256] + - [33, 19412.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 256] + - [7, 18563.0] + - - [60800, 256, 2, 64, 60800, 60800, 60800, 64] + - [1, 16121.0] + - - [3800, 1024, 1, 256, 3800, 3800, 3800, 256] + - [6, 17104.0] + - - [3400, 1024, 1, 256, 3400, 3400, 3400, 256] + - [2, 17719.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 256] + - [1, 18772.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 256] + - [1, 18541.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 128] + - [7, 18680.0] + - - [49152, 256, 2, 256, 49152, 49152, 49152, 256] + - [39, 19033.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 128] + - [6, 18318.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 128] + - [30, 18626.0] + - - [42240, 256, 2, 64, 42240, 42240, 42240, 64] + - [22, 18465.0] + - - [1008, 2048, 2, 512, 1008, 1008, 1008, 512] + - [22, 17404.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 256] + - [1, 18630.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 128] + - [36, 19151.0] + - - [56832, 256, 2, 64, 56832, 56832, 56832, 64] + - [0, 17944.0] + - - [43008, 256, 2, 64, 43008, 43008, 43008, 64] + - [14, 18170.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 128] + - [0, 18538.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 256] + - [14, 18179.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 256] + - [0, 17897.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 128] + - [1, 19105.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 1024] + - [22, 16976.0] + - - [55296, 256, 2, 64, 55296, 55296, 55296, 64] + - [0, 17781.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 1024] + - [35, 16046.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 128] + - [1, 18960.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 256] + - [1, 18545.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 256] + - [7, 17960.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 256] + - [1, 19446.0] + - - [49152, 256, 2, 64, 49152, 49152, 49152, 64] + - [35, 18352.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 256] + - [36, 18392.0] + - - [950, 2048, 2, 512, 950, 950, 950, 512] + - [22, 16403.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 256] + - [6, 18198.0] + - - [1610, 2048, 1, 1024, 1610, 1610, 1610, 1024] + - [7, 17870.0] + - - [6912, 256, 1, 512, 6912, 6912, 6912, 512] + - [0, 17799.0] + - - [6800, 256, 1, 512, 6800, 6800, 6800, 512] + - [31, 17469.0] + - - [27648, 128, 1, 256, 27648, 27648, 27648, 256] + - [7, 18429.0] + - - [27200, 128, 1, 256, 27200, 27200, 27200, 256] + - [2, 17449.0] + - - [30400, 128, 1, 256, 30400, 30400, 30400, 256] + - [8, 17022.0] + - - [7600, 256, 1, 512, 7600, 7600, 7600, 512] + - [0, 16907.0] + - - [6144, 1024, 1, 512, 6144, 6144, 6144, 512] + - [1, 19243.0] + - - [6912, 1024, 1, 512, 6912, 6912, 6912, 512] + - [1, 19922.0] + - - [6440, 1024, 1, 512, 6440, 6440, 6440, 512] + - [7, 18412.0] + - - [27648, 512, 1, 256, 27648, 27648, 27648, 256] + - [1, 19855.0] + - - [1728, 2048, 1, 1024, 1728, 1728, 1728, 1024] + - [35, 17248.0] + - - [27200, 512, 1, 256, 27200, 27200, 27200, 256] + - [23, 19391.0] + - - [6800, 1024, 1, 512, 6800, 6800, 6800, 512] + - [36, 19263.0] + - - [1700, 2048, 1, 1024, 1700, 1700, 1700, 1024] + - [22, 16996.0] + - - [7600, 1024, 1, 512, 7600, 7600, 7600, 512] + - [1, 18709.0] + - - [30400, 512, 1, 256, 30400, 30400, 30400, 256] + - [30, 19392.0] + - - [1900, 2048, 1, 1024, 1900, 1900, 1900, 1024] + - [7, 18320.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [7, 20212.0] + - - [1024, 1024, 160, 96, 1024, 1024, 1024, 96] + - [0, 15577.0] + - - [1920, 16384, 1, 25216, 1920, 1920, 1920, 25216] + - [15, 20419.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 3840, 1920] + - [1, 20820.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 1920, 3840] + - [23, 20481.0] + - - [960, 16384, 1, 1920, 960, 960, 960, 1920] + - [1, 18944.0] + - - [1920, 16384, 1, 2880, 1920, 1920, 1920, 2880] + - [15, 20579.0] + - - [1024, 1024, 40, 96, 1024, 1024, 1024, 96] + - [0, 18943.0] + - - [1920, 4096, 1, 25216, 1920, 1920, 1920, 25216] + - [21, 19521.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 3840, 1920] + - [15, 20370.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 1920, 3840] + - [34, 19406.0] + - - [960, 4096, 1, 1920, 960, 960, 960, 1920] + - [0, 16905.0] + - - [1920, 4096, 1, 2880, 1920, 1920, 1920, 2880] + - [15, 19488.0] + - - [1024, 1024, 80, 96, 1024, 1024, 1024, 96] + - [0, 17042.0] + - - [1920, 8192, 1, 25216, 1920, 1920, 1920, 25216] + - [30, 20302.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 3840, 1920] + - [1, 20567.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 1920, 3840] + - [23, 20240.0] + - - [960, 8192, 1, 1920, 960, 960, 960, 1920] + - [1, 18158.0] + - - [1920, 8192, 1, 2880, 1920, 1920, 1920, 2880] + - [30, 20424.0] + - - [1024, 1024, 96, 96, 1024, 1024, 1024, 96] + - [0, 16137.0] + - - [2304, 16384, 1, 12672, 2304, 2304, 2304, 12672] + - [30, 20788.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 20752.0] + - - [576, 16384, 1, 2304, 576, 576, 576, 2304] + - [36, 18039.0] + - - [2304, 16384, 1, 1728, 2304, 2304, 2304, 1728] + - [15, 20802.0] + - - [1024, 1024, 24, 96, 1024, 1024, 1024, 96] + - [7, 18960.0] + - - [2304, 4096, 1, 12672, 2304, 2304, 2304, 12672] + - [21, 20538.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 20427.0] + - - [576, 4096, 1, 2304, 576, 576, 576, 2304] + - [35, 16478.0] + - - [2304, 4096, 1, 1728, 2304, 2304, 2304, 1728] + - [30, 20490.0] + - - [1024, 1024, 48, 96, 1024, 1024, 1024, 96] + - [0, 19033.0] + - - [2304, 8192, 1, 12672, 2304, 2304, 2304, 12672] + - [15, 20660.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 20627.0] + - - [576, 8192, 1, 2304, 576, 576, 576, 2304] + - [7, 17833.0] + - - [2304, 8192, 1, 1728, 2304, 2304, 2304, 1728] + - [30, 20693.0] + - - [1024, 1024, 16, 96, 1024, 1024, 1024, 96] + - [35, 18715.0] + - - [3072, 4096, 1, 6400, 3072, 3072, 3072, 6400] + - [1, 19978.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 1536, 3072] + - [36, 19626.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 3072, 1536] + - [7, 19833.0] + - - [384, 4096, 1, 3072, 384, 384, 384, 3072] + - [37, 16667.0] + - - [3072, 4096, 1, 1152, 3072, 3072, 3072, 1152] + - [1, 19907.0] + - - [1024, 1024, 32, 96, 1024, 1024, 1024, 96] + - [0, 19018.0] + - - [3072, 8192, 1, 6400, 3072, 3072, 3072, 6400] + - [15, 20639.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 1536, 3072] + - [23, 19861.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 3072, 1536] + - [1, 20435.0] + - - [384, 8192, 1, 3072, 384, 384, 384, 3072] + - [22, 18263.0] + - - [3072, 8192, 1, 1152, 3072, 3072, 3072, 1152] + - [1, 20538.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 19307.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [7, 19387.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 4096, 2048] + - [7, 20271.0] + - - [1024, 2283, 1, 29000, 1024, 1024, 1024, 29000] + - [21, 19758.0] + - - [1024, 2296, 1, 29000, 1024, 1024, 1024, 29000] + - [21, 19859.0] + - - [1024, 2306, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16661.0] + - - [1024, 2309, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16675.0] + - - [1024, 2318, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 16652.0] + - - [1024, 2320, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16778.0] + - - [1024, 2324, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 16697.0] + - - [1024, 2325, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16811.0] + - - [1024, 2329, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16811.0] + - - [1024, 2338, 1, 29000, 1024, 1024, 1024, 29000] + - [14, 16851.0] + - - [1024, 2345, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16872.0] + - - [1024, 2350, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 16942.0] + - - [1024, 2362, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 17034.0] + - - [1024, 2366, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 17058.0] + - - [1024, 2368, 1, 29000, 1024, 1024, 1024, 29000] + - [29, 17078.0] + - - [1024, 2374, 1, 29000, 1024, 1024, 1024, 29000] + - [14, 17043.0] + - - [1024, 2390, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 17194.0] + - - [512, 512, 320, 64, 512, 512, 512, 64] + - [0, 11745.0] + - - [512, 512, 80, 64, 512, 512, 512, 64] + - [29, 18261.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [29, 18376.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 2560, 4096] + - [35, 18366.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 2560] + - [29, 18069.0] + - - [1024, 1024, 512, 64, 1024, 1024, 1024, 64] + - [10, 11048.0] + - - [1024, 32768, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 20474.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 20480.0] + - - [1024, 32768, 1, 50304, 1024, 1024, 1024, 50304] + - [5, 20019.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 20636.0] + - - [1024, 1024, 24, 128, 1024, 1024, 1024, 128] + - [7, 19319.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [10, 17305.0] + - - [768, 320, 1, 30522, 768, 768, 768, 30522] + - [45, 14933.0] + - - [768, 640, 1, 30522, 768, 768, 768, 30522] + - [44, 17346.0] + - - [768, 1280, 1, 30522, 768, 768, 768, 30522] + - [42, 18790.0] + - - [1024, 780, 1, 30522, 1024, 1024, 1024, 30522] + - [41, 16895.0] + - - [1024, 308, 1, 30522, 1024, 1024, 1024, 30522] + - [41, 15189.0] + - - [1024, 800, 1, 30522, 1024, 1024, 1024, 30522] + - [41, 17301.0] + - - [1024, 820, 1, 30522, 1024, 1024, 1024, 30522] + - [41, 17714.0] + - - [1024, 385, 1, 30522, 1024, 1024, 1024, 30522] + - [41, 14103.0] + - - [1024, 462, 1, 30522, 1024, 1024, 1024, 30522] + - [43, 15459.0] + - - [1024, 640, 1, 30528, 1024, 1024, 1024, 30528] + - [41, 17997.0] + - - [2048, 199, 1, 29000, 2048, 2048, 2048, 29000] + - [50, 13362.0] + - - [2048, 221, 1, 29000, 2048, 2048, 2048, 29000] + - [48, 14728.0] + - - [2048, 224, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 14916.0] + - - [2048, 229, 1, 29000, 2048, 2048, 2048, 29000] + - [51, 15324.0] + - - [2048, 234, 1, 29000, 2048, 2048, 2048, 29000] + - [51, 15627.0] + - - [2048, 242, 1, 29000, 2048, 2048, 2048, 29000] + - [51, 16153.0] + - - [2048, 246, 1, 29000, 2048, 2048, 2048, 29000] + - [43, 16322.0] + - - [2048, 247, 1, 29000, 2048, 2048, 2048, 29000] + - [43, 16495.0] + - - [2048, 256, 1, 29000, 2048, 2048, 2048, 29000] + - [50, 17074.0] + - - [2048, 262, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 14545.0] + - - [2048, 264, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 14663.0] + - - [2048, 265, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 14717.0] + - - [2048, 274, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 15173.0] + - - [2048, 277, 1, 29000, 2048, 2048, 2048, 29000] + - [49, 15283.0] + - - [2048, 279, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 15493.0] + - - [2048, 288, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 15913.0] + - - [2048, 296, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 16362.0] + - - [2048, 315, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 17415.0] + - - [2048, 335, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 15590.0] + - - [1024, 561, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17854.0] + - - [1024, 574, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 18251.0] + - - [1024, 600, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 16932.0] + - - [1024, 608, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17133.0] + - - [1024, 615, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17301.0] + - - [1024, 622, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17508.0] + - - [1024, 625, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17567.0] + - - [1024, 626, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17611.0] + - - [1024, 628, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17679.0] + - - [1024, 636, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17878.0] + - - [1024, 651, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 16769.0] + - - [1024, 658, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 16997.0] + - - [1024, 669, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17232.0] + - - [1024, 670, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17263.0] + - - [1024, 672, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17323.0] + - - [1024, 684, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17590.0] + - - [1024, 716, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 16893.0] + - - [1024, 730, 1, 29000, 1024, 1024, 1024, 29000] + - [41, 17220.0] + - - [1600, 512, 1, 1024, 1600, 1600, 1600, 1024] + - [83, 14665.0] + - - [1024, 512, 1, 1, 1024, 1024, 1024, 1] + - [94, 213.0] + - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] + - [53, 6765.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 1] + - [94, 343.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [98, 12693.0] + - - [768, 1024, 1, 2, 768, 768, 768, 2] + - [94, 610.0] + - - [768, 1024, 1, 768, 768, 768, 768, 768] + - [98, 14796.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [64, 14677.0] + - - [768, 512, 1, 2, 768, 768, 768, 2] + - [94, 371.0] + - - [768, 512, 1, 768, 768, 768, 768, 768] + - [97, 10471.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 13571.0] + - - [1024, 512, 1, 2, 1024, 1024, 1024, 2] + - [94, 460.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [78, 9977.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [55, 5853.0] + - - [704, 1024, 1, 128, 704, 704, 704, 128] + - [74, 10185.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [57, 16301.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [56, 16253.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [107, 13500.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 128] + - [54, 12435.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [64, 16974.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [98, 16065.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [75, 12043.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [82, 10102.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [83, 15772.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [98, 9350.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [55, 7334.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [62, 7642.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [63, 12643.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [106, 12628.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [56, 14426.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [55, 7584.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [55, 10503.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [98, 14887.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [75, 12278.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [57, 15515.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [55, 9922.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [55, 10613.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [75, 9632.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [75, 12727.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [64, 15355.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 128] + - [98, 10913.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [105, 8581.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [65, 13663.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [63, 10246.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [62, 9080.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [98, 15031.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 128] + - [54, 8133.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [76, 14233.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [105, 8419.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [55, 8900.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [63, 12077.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [63, 9494.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 128] + - [63, 11249.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [61, 9754.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [74, 8818.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [75, 12311.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [105, 10347.0] + - - [448, 1856, 1, 128, 448, 448, 448, 128] + - [54, 11543.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [56, 13897.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [63, 12610.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [56, 13533.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [64, 14973.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [99, 14469.0] + - - [704, 1856, 1, 128, 704, 704, 704, 128] + - [54, 13026.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [83, 12531.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 128] + - [75, 13883.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [93, 9368.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [82, 10099.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [64, 13723.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [56, 10021.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [56, 12420.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [104, 11770.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [63, 12480.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [56, 14848.0] + - - [448, 2368, 1, 128, 448, 448, 448, 128] + - [54, 12504.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [98, 13512.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [64, 13958.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [66, 8302.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [106, 14810.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [76, 13752.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [81, 10417.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [97, 8689.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [56, 12659.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [55, 9050.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [76, 15967.0] + - - [448, 1024, 1, 128, 448, 448, 448, 128] + - [54, 8044.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [58, 14354.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 128] + - [55, 6680.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [98, 12577.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [64, 13232.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [54, 10170.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [83, 14424.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [64, 16554.0] + - - [256, 1856, 1, 128, 256, 256, 256, 128] + - [98, 8470.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [98, 13510.0] + - - [448, 1408, 1, 128, 448, 448, 448, 128] + - [54, 9011.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [75, 12201.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [55, 7602.0] + - - [704, 448, 1, 128, 704, 704, 704, 128] + - [55, 6618.0] + - - [704, 1408, 1, 128, 704, 704, 704, 128] + - [54, 11792.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [107, 14222.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [55, 7584.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [63, 11651.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [74, 8886.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [58, 14165.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [65, 10578.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [75, 14550.0] + - - [256, 2368, 1, 128, 256, 256, 256, 128] + - [63, 8939.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [56, 14581.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [56, 15726.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [98, 10648.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [76, 14401.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [83, 14368.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [56, 11739.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [75, 14993.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 128] + - [54, 11696.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [55, 8423.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [61, 8590.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [62, 8742.0] + - - [3025, 64, 64, 64, 3025, 3025, 3025, 64] + - [75, 16085.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [75, 11537.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [75, 15558.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [63, 9902.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [55, 8723.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [99, 16613.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [56, 14999.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [76, 15379.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 128] + - [55, 7442.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [82, 9808.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [75, 12049.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [105, 10581.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [75, 16288.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [56, 12561.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [55, 7785.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [83, 14625.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [63, 12117.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 128] + - [56, 10438.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [83, 12776.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 128] + - [56, 12614.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [54, 11593.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 128] + - [75, 9323.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [75, 12601.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [105, 8802.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [63, 10667.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [81, 13091.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [57, 12691.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [56, 14834.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [99, 15140.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [104, 13747.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [104, 13445.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [104, 10368.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [105, 8573.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [97, 9094.0] + - - [64, 5888, 1, 128, 64, 64, 64, 128] + - [62, 7537.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [56, 12680.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [97, 7532.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [76, 10565.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [63, 12472.0] + - - [704, 704, 1, 128, 704, 704, 704, 128] + - [61, 8595.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [55, 6402.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [55, 9265.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [83, 12516.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [54, 10237.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [56, 11993.0] + - - [256, 3584, 1, 128, 256, 256, 256, 128] + - [98, 11402.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 128] + - [54, 12844.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [85, 14391.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 128] + - [74, 8738.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [63, 10239.0] + - - [256, 2944, 1, 128, 256, 256, 256, 128] + - [98, 10839.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [74, 10138.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [75, 13647.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [76, 17203.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 128] + - [96, 13211.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [54, 9418.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [75, 11720.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [54, 8111.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [63, 10318.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [98, 16118.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [56, 12098.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [61, 14363.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 128] + - [54, 8308.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [82, 7225.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [55, 10289.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [56, 12274.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [63, 10928.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [98, 15826.0] + - - [448, 2944, 1, 128, 448, 448, 448, 128] + - [54, 13107.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [55, 8592.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [98, 15928.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [63, 14740.0] + - - [64, 5056, 1, 128, 64, 64, 64, 128] + - [105, 6554.0] + - - [64, 6784, 1, 128, 64, 64, 64, 128] + - [62, 7410.0] + - - [448, 704, 1, 128, 448, 448, 448, 128] + - [55, 6596.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [56, 10794.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 128] + - [75, 12223.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [98, 11943.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [56, 14507.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [64, 13954.0] + - - [256, 1408, 1, 128, 256, 256, 256, 128] + - [55, 7187.0] + - - [256, 4288, 1, 128, 256, 256, 256, 128] + - [98, 13132.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [56, 10695.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [98, 12528.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [56, 13822.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [56, 13019.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [110, 9705.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 128] + - [75, 7740.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [56, 12957.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [105, 8372.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [76, 14153.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [106, 15029.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [56, 12269.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [56, 12506.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [63, 14759.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [56, 10055.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [55, 10324.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [55, 6319.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [63, 14485.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [105, 9156.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [56, 14188.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [63, 11913.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [56, 13138.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [98, 10983.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [75, 8066.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [64, 13107.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [60, 14389.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 128] + - [62, 7353.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [54, 10744.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [76, 13479.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [75, 13221.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [56, 14231.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [63, 14408.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [97, 8868.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [75, 16096.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [63, 14852.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [52, 9068.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [66, 10170.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [52, 6203.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [87, 10198.0] + - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] + - [83, 13769.0] + - - [4096, 256, 1, 2048, 4096, 4096, 4096, 2048] + - [64, 16249.0] + - - [2048, 256, 1, 4096, 2048, 2048, 2048, 4096] + - [80, 14395.0] + - - [512, 768, 1, 2048, 512, 512, 512, 2048] + - [105, 10839.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 1024] + - [75, 13626.0] + - - [2048, 200, 1, 512, 2048, 2048, 2048, 512] + - [98, 9781.0] + - - [4096, 200, 1, 1024, 4096, 4096, 4096, 1024] + - [99, 12154.0] + - - [2048, 200, 1, 4096, 2048, 2048, 2048, 4096] + - [75, 11269.0] + - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] + - [107, 15795.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] + - [76, 15140.0] + - - [2048, 512, 1, 4096, 2048, 2048, 2048, 4096] + - [57, 16260.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 4096] + - [64, 16190.0] + - - [4096, 200, 1, 2048, 4096, 4096, 4096, 2048] + - [76, 12563.0] + - - [2048, 200, 1, 1024, 2048, 2048, 2048, 1024] + - [98, 10624.0] + - - [1024, 768, 1, 512, 1024, 1024, 1024, 512] + - [98, 14188.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [98, 11084.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [98, 14128.0] + - - [512, 768, 1, 512, 512, 512, 512, 512] + - [105, 9996.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [107, 16356.0] + - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] + - [63, 12770.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] + - [64, 16067.0] + - - [4096, 256, 1, 1024, 4096, 4096, 4096, 1024] + - [64, 15855.0] + - - [512, 768, 1, 1024, 512, 512, 512, 1024] + - [82, 10574.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 4096] + - [80, 14428.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [76, 12831.0] + - - [2048, 256, 1, 512, 2048, 2048, 2048, 512] + - [56, 12943.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [64, 15657.0] + - - [4096, 192, 1, 2048, 4096, 4096, 4096, 2048] + - [83, 15295.0] + - - [5329, 64, 64, 160, 5329, 5329, 5329, 160] + - [52, 8538.0] + - - [1225, 64, 64, 384, 1225, 1225, 1225, 384] + - [79, 16148.0] + - - [4096, 320, 1, 1280, 4096, 4096, 4096, 1280] + - [75, 15753.0] + - - [4096, 192, 1, 1280, 4096, 4096, 4096, 1280] + - [98, 15078.0] + - - [1225, 96, 64, 384, 1225, 1225, 1225, 384] + - [60, 12895.0] + - - [4096, 320, 1, 2048, 4096, 4096, 4096, 2048] + - [75, 15942.0] + - - [4096, 256, 1, 1536, 4096, 4096, 4096, 1536] + - [76, 16125.0] + - - [64, 147, 432, 148, 64, 64, 64, 148] + - [77, 10360.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [100, 11311.0] + - - [64, 111, 576, 112, 64, 64, 64, 112] + - [77, 11406.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [54, 8346.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [77, 9179.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [77, 11386.0] + - - [64, 85, 752, 84, 64, 64, 64, 84] + - [54, 8893.0] + - - [64, 122, 528, 123, 64, 64, 64, 123] + - [77, 11354.0] + - - [64, 93, 688, 92, 64, 64, 64, 92] + - [61, 9504.0] + - - [64, 102, 624, 99, 64, 64, 64, 99] + - [58, 9798.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [77, 9823.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [58, 12604.0] + - - [64, 162, 400, 159, 64, 64, 64, 159] + - [58, 11646.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [73, 8653.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [100, 9703.0] + - - [64, 101, 624, 102, 64, 64, 64, 102] + - [77, 9899.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [58, 10647.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [73, 11540.0] + - - [64, 135, 480, 132, 64, 64, 64, 132] + - [100, 9818.0] + - - [64, 134, 480, 132, 64, 64, 64, 132] + - [58, 9798.0] + - - [64, 134, 480, 135, 64, 64, 64, 135] + - [100, 9808.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [100, 11592.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [77, 9902.0] + - - [64, 135, 480, 133, 64, 64, 64, 133] + - [100, 9776.0] + - - [64, 148, 432, 143, 64, 64, 64, 143] + - [100, 10780.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [81, 9984.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [96, 7748.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [77, 11227.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [100, 12410.0] + - - [64, 112, 576, 111, 64, 64, 64, 111] + - [58, 10834.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [77, 10466.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [58, 9819.0] + - - [64, 232, 272, 228, 64, 64, 64, 228] + - [100, 12579.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [58, 10684.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [54, 8043.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [73, 8848.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [54, 10671.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [58, 8303.0] + - - [64, 102, 624, 100, 64, 64, 64, 100] + - [58, 10118.0] + - - [64, 78, 816, 77, 64, 64, 64, 77] + - [96, 8298.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [77, 11373.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [77, 10810.0] + - - [64, 159, 400, 160, 64, 64, 64, 160] + - [58, 11794.0] + - - [64, 102, 624, 101, 64, 64, 64, 101] + - [77, 9928.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [58, 9774.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [96, 13682.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [58, 9124.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [58, 10478.0] + - - [64, 100, 624, 102, 64, 64, 64, 102] + - [77, 9892.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [77, 12270.0] + - - [500, 1024, 1, 512, 500, 500, 500, 512] + - [106, 11926.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [106, 12906.0] + - - [200, 2048, 1, 512, 200, 200, 200, 512] + - [83, 9602.0] + - - [512, 2000, 1, 1024, 512, 512, 512, 1024] + - [64, 15285.0] + - - [512, 2048, 1, 512, 512, 512, 512, 512] + - [84, 15064.0] + - - [200, 2000, 1, 100, 200, 200, 200, 100] + - [95, 6135.0] + - - [200, 2000, 1, 1024, 200, 200, 200, 1024] + - [83, 10235.0] + - - [500, 1024, 1, 2048, 500, 500, 500, 2048] + - [83, 13547.0] + - - [512, 2048, 1, 100, 512, 512, 512, 100] + - [98, 11625.0] + - - [512, 2048, 1, 2000, 512, 512, 512, 2000] + - [76, 16438.0] + - - [200, 2000, 1, 10, 200, 200, 200, 10] + - [94, 1538.0] + - - [500, 2048, 1, 1024, 500, 500, 500, 1024] + - [84, 15005.0] + - - [500, 2000, 1, 10, 500, 500, 500, 10] + - [101, 2475.0] + - - [500, 2048, 1, 100, 500, 500, 500, 100] + - [98, 10385.0] + - - [512, 1024, 1, 500, 512, 512, 512, 500] + - [98, 12838.0] + - - [200, 2000, 1, 2000, 200, 200, 200, 2000] + - [76, 10893.0] + - - [500, 2048, 1, 2000, 500, 500, 500, 2000] + - [99, 15832.0] + - - [512, 2048, 1, 1024, 512, 512, 512, 1024] + - [64, 15712.0] + - - [512, 1024, 1, 100, 512, 512, 512, 100] + - [98, 8066.0] + - - [256, 2000, 1, 10, 256, 256, 256, 10] + - [94, 2000.0] + - - [512, 2000, 1, 100, 512, 512, 512, 100] + - [98, 10870.0] + - - [512, 2000, 1, 2048, 512, 512, 512, 2048] + - [64, 15678.0] + - - [500, 1024, 1, 500, 500, 500, 500, 500] + - [56, 12179.0] + - - [256, 2000, 1, 100, 256, 256, 256, 100] + - [97, 7442.0] + - - [512, 1024, 1, 2048, 512, 512, 512, 2048] + - [83, 14154.0] + - - [500, 2048, 1, 2048, 500, 500, 500, 2048] + - [107, 15482.0] + - - [200, 2048, 1, 10, 200, 200, 200, 10] + - [94, 1600.0] + - - [500, 2000, 1, 512, 500, 500, 500, 512] + - [106, 13913.0] + - - [500, 1024, 1, 1024, 500, 500, 500, 1024] + - [106, 12984.0] + - - [200, 2000, 1, 500, 200, 200, 200, 500] + - [56, 9533.0] + - - [256, 2048, 1, 100, 256, 256, 256, 100] + - [98, 8017.0] + - - [500, 2000, 1, 1024, 500, 500, 500, 1024] + - [84, 14654.0] + - - [256, 2048, 1, 1024, 256, 256, 256, 1024] + - [106, 13633.0] + - - [200, 2048, 1, 1024, 200, 200, 200, 1024] + - [83, 10444.0] + - - [512, 2048, 1, 500, 512, 512, 512, 500] + - [76, 15447.0] + - - [512, 2000, 1, 10, 512, 512, 512, 10] + - [94, 2860.0] + - - [500, 1024, 1, 2000, 500, 500, 500, 2000] + - [76, 13921.0] + - - [512, 2000, 1, 512, 512, 512, 512, 512] + - [64, 14507.0] + - - [500, 2000, 1, 2000, 500, 500, 500, 2000] + - [76, 15497.0] + - - [500, 1024, 1, 10, 500, 500, 500, 10] + - [94, 1803.0] + - - [256, 2048, 1, 10, 256, 256, 256, 10] + - [94, 2081.0] + - - [256, 2048, 1, 500, 256, 256, 256, 500] + - [98, 12850.0] + - - [256, 2048, 1, 2048, 256, 256, 256, 2048] + - [106, 14150.0] + - - [256, 2000, 1, 512, 256, 256, 256, 512] + - [64, 11745.0] + - - [512, 1024, 1, 2000, 512, 512, 512, 2000] + - [99, 14523.0] + - - [256, 2000, 1, 2000, 256, 256, 256, 2000] + - [57, 13978.0] + - - [256, 2048, 1, 2000, 256, 256, 256, 2000] + - [76, 14515.0] + - - [200, 2048, 1, 100, 200, 200, 200, 100] + - [55, 6302.0] + - - [200, 2000, 1, 2048, 200, 200, 200, 2048] + - [63, 10650.0] + - - [500, 2048, 1, 512, 500, 500, 500, 512] + - [83, 14294.0] + - - [500, 2000, 1, 500, 500, 500, 500, 500] + - [99, 14180.0] + - - [200, 2048, 1, 2048, 200, 200, 200, 2048] + - [83, 10937.0] + - - [200, 2048, 1, 500, 200, 200, 200, 500] + - [98, 9761.0] + - - [512, 2000, 1, 500, 512, 512, 512, 500] + - [99, 14637.0] + - - [200, 2048, 1, 2000, 200, 200, 200, 2000] + - [76, 11143.0] + - - [500, 1024, 1, 100, 500, 500, 500, 100] + - [53, 7356.0] + - - [512, 1024, 1, 10, 512, 512, 512, 10] + - [94, 2016.0] + - - [512, 1024, 1, 1024, 512, 512, 512, 1024] + - [106, 13696.0] + - - [500, 2048, 1, 10, 500, 500, 500, 10] + - [112, 2522.0] + - - [200, 2000, 1, 512, 200, 200, 200, 512] + - [106, 9369.0] + - - [256, 2000, 1, 500, 256, 256, 256, 500] + - [98, 12075.0] + - - [256, 2048, 1, 512, 256, 256, 256, 512] + - [63, 12807.0] + - - [256, 2000, 1, 2048, 256, 256, 256, 2048] + - [83, 13628.0] + - - [500, 2048, 1, 500, 500, 500, 500, 500] + - [76, 14570.0] + - - [256, 2000, 1, 1024, 256, 256, 256, 1024] + - [106, 12952.0] + - - [500, 2000, 1, 2048, 500, 500, 500, 2048] + - [84, 15125.0] + - - [512, 2000, 1, 2000, 512, 512, 512, 2000] + - [64, 15869.0] + - - [512, 2048, 1, 2048, 512, 512, 512, 2048] + - [64, 16045.0] + - - [512, 2048, 1, 10, 512, 512, 512, 10] + - [94, 2962.0] + - - [500, 2000, 1, 100, 500, 500, 500, 100] + - [96, 9921.0] + - - [1024, 1131, 1, 1024, 1024, 1024, 1024, 1024] + - [107, 16822.0] + - - [1024, 1102, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 16665.0] + - - [1024, 774, 1, 1024, 1024, 1024, 1024, 1024] + - [63, 14756.0] + - - [4096, 128, 1, 2048, 4096, 4096, 4096, 2048] + - [56, 14095.0] + - - [4096, 128, 1, 3072, 4096, 4096, 4096, 3072] + - [56, 14258.0] + - - [1024, 1120, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 16878.0] + - - [1024, 1015, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 15309.0] + - - [1024, 992, 1, 1024, 1024, 1024, 1024, 1024] + - [107, 14997.0] + - - [1024, 950, 1, 1024, 1024, 1024, 1024, 1024] + - [107, 14496.0] + - - [1024, 1088, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 16505.0] + - - [64, 128, 96, 128, 64, 64, 64, 128] + - [108, 9514.0] + - - [768, 1024, 1, 3072, 768, 768, 768, 3072] + - [83, 15314.0] + - - [768, 512, 1, 3072, 768, 768, 768, 3072] + - [97, 10956.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [65, 13197.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [108, 11671.0] + - - [64, 256, 96, 256, 64, 64, 64, 256] + - [65, 11685.0] + - - [6272, 112, 1, 512, 6272, 6272, 6272, 512] + - [98, 12497.0] + - - [2048, 320, 1, 1280, 2048, 2048, 2048, 1280] + - [98, 12668.0] + - - [5329, 64, 1, 448, 5329, 5329, 5329, 448] + - [103, 8470.0] + - - [784, 64, 32, 192, 784, 784, 784, 192] + - [54, 14090.0] + - - [6272, 64, 1, 480, 6272, 6272, 6272, 480] + - [55, 10370.0] + - - [6272, 64, 1, 512, 6272, 6272, 6272, 512] + - [55, 10411.0] + - - [6272, 160, 1, 528, 6272, 6272, 6272, 528] + - [75, 12075.0] + - - [289, 160, 32, 768, 289, 289, 289, 768] + - [83, 11446.0] + - - [5329, 64, 32, 160, 5329, 5329, 5329, 160] + - [83, 12315.0] + - - [5329, 96, 1, 576, 5329, 5329, 5329, 576] + - [98, 8984.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 288] + - [56, 16618.0] + - - [289, 192, 32, 768, 289, 289, 289, 768] + - [83, 13691.0] + - - [2048, 448, 1, 1280, 2048, 2048, 2048, 1280] + - [76, 13984.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [54, 15761.0] + - - [6272, 128, 1, 528, 6272, 6272, 6272, 528] + - [75, 14739.0] + - - [6272, 96, 1, 480, 6272, 6272, 6272, 480] + - [98, 10768.0] + - - [2048, 448, 1, 2048, 2048, 2048, 2048, 2048] + - [76, 14052.0] + - - [784, 96, 32, 192, 784, 784, 784, 192] + - [75, 11351.0] + - - [1001, 512, 1, 4096, 1001, 1001, 1001, 4096] + - [102, 13986.0] + - - [2048, 192, 1, 1280, 2048, 2048, 2048, 1280] + - [55, 10732.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 256] + - [79, 15365.0] + - - [2048, 256, 1, 1536, 2048, 2048, 2048, 1536] + - [56, 14040.0] + - - [6272, 128, 1, 512, 6272, 6272, 6272, 512] + - [75, 14597.0] + - - [1568, 384, 1, 832, 1568, 1568, 1568, 832] + - [75, 11288.0] + - - [1568, 256, 1, 832, 1568, 1568, 1568, 832] + - [55, 10697.0] + - - [1568, 192, 1, 832, 1568, 1568, 1568, 832] + - [72, 8239.0] + - - [289, 192, 32, 1024, 289, 289, 289, 1024] + - [63, 13677.0] + - - [1225, 64, 32, 384, 1225, 1225, 1225, 384] + - [56, 16824.0] + - - [2048, 320, 1, 2048, 2048, 2048, 2048, 2048] + - [63, 12770.0] + - - [2048, 384, 1, 1536, 2048, 2048, 2048, 1536] + - [75, 15126.0] + - - [5041, 96, 1, 576, 5041, 5041, 5041, 576] + - [98, 8917.0] + - - [6272, 192, 1, 480, 6272, 6272, 6272, 480] + - [98, 14451.0] + - - [5041, 192, 1, 720, 5041, 5041, 5041, 720] + - [98, 13971.0] + - - [289, 128, 32, 768, 289, 289, 289, 768] + - [107, 12138.0] + - - [12544, 64, 1, 147, 12544, 12544, 12544, 147] + - [98, 11570.0] + - - [6272, 160, 1, 512, 6272, 6272, 6272, 512] + - [98, 12050.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 192] + - [106, 15929.0] + - - [784, 64, 32, 256, 784, 784, 784, 256] + - [106, 13116.0] + - - [6272, 144, 1, 512, 6272, 6272, 6272, 512] + - [56, 10749.0] + - - [8192, 192, 1, 1280, 8192, 8192, 8192, 1280] + - [98, 16294.0] + - - [8192, 192, 1, 2048, 8192, 8192, 8192, 2048] + - [56, 16358.0] + - - [65, 6400, 1, 1024, 65, 65, 65, 1024] + - [63, 7712.0] + - - [512, 1290, 1, 2048, 512, 512, 512, 2048] + - [106, 12792.0] + - - [512, 2205, 1, 2048, 512, 512, 512, 2048] + - [64, 17203.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [108, 11120.0] + - - [512, 600, 1, 2048, 512, 512, 512, 2048] + - [105, 8490.0] + - - [512, 644, 1, 512, 512, 512, 512, 512] + - [82, 8341.0] + - - [512, 644, 1, 2048, 512, 512, 512, 2048] + - [82, 9093.0] + - - [512, 668, 1, 2048, 512, 512, 512, 2048] + - [82, 9450.0] + - - [512, 714, 1, 512, 512, 512, 512, 512] + - [105, 9175.0] + - - [512, 714, 1, 2048, 512, 512, 512, 2048] + - [105, 10079.0] + - - [512, 720, 1, 512, 512, 512, 512, 512] + - [82, 9261.0] + - - [512, 720, 1, 2048, 512, 512, 512, 2048] + - [105, 10156.0] + - - [512, 722, 1, 2048, 512, 512, 512, 2048] + - [105, 10187.0] + - - [512, 781, 1, 512, 512, 512, 512, 512] + - [105, 9997.0] + - - [512, 781, 1, 2048, 512, 512, 512, 2048] + - [82, 11031.0] + - - [512, 848, 1, 2048, 512, 512, 512, 2048] + - [83, 11654.0] + - - [512, 872, 1, 2048, 512, 512, 512, 2048] + - [83, 11999.0] + - - [512, 936, 1, 512, 512, 512, 512, 512] + - [106, 11360.0] + - - [512, 936, 1, 2048, 512, 512, 512, 2048] + - [83, 12836.0] + - - [512, 980, 1, 512, 512, 512, 512, 512] + - [64, 11541.0] + - - [512, 980, 1, 2048, 512, 512, 512, 2048] + - [106, 13422.0] + - - [512, 1139, 1, 2048, 512, 512, 512, 2048] + - [83, 15467.0] + - - [512, 1184, 1, 2048, 512, 512, 512, 2048] + - [83, 11706.0] + - - [512, 1186, 1, 2048, 512, 512, 512, 2048] + - [83, 11779.0] + - - [512, 1232, 1, 512, 512, 512, 512, 512] + - [106, 11245.0] + - - [512, 1232, 1, 2048, 512, 512, 512, 2048] + - [83, 12296.0] + - - [512, 1279, 1, 2048, 512, 512, 512, 2048] + - [83, 12717.0] + - - [512, 1290, 1, 512, 512, 512, 512, 512] + - [106, 11693.0] + - - [512, 1327, 1, 2048, 512, 512, 512, 2048] + - [83, 13152.0] + - - [512, 1331, 1, 2048, 512, 512, 512, 2048] + - [106, 13117.0] + - - [512, 1341, 1, 2048, 512, 512, 512, 2048] + - [83, 13293.0] + - - [512, 1350, 1, 512, 512, 512, 512, 512] + - [106, 12297.0] + - - [512, 1350, 1, 2048, 512, 512, 512, 2048] + - [83, 13441.0] + - - [512, 1359, 1, 2048, 512, 512, 512, 2048] + - [106, 13438.0] + - - [512, 1391, 1, 2048, 512, 512, 512, 2048] + - [83, 13783.0] + - - [512, 1424, 1, 512, 512, 512, 512, 512] + - [106, 12881.0] + - - [512, 1424, 1, 2048, 512, 512, 512, 2048] + - [63, 14102.0] + - - [512, 1458, 1, 512, 512, 512, 512, 512] + - [106, 13189.0] + - - [512, 1458, 1, 2048, 512, 512, 512, 2048] + - [106, 14404.0] + - - [512, 1462, 1, 512, 512, 512, 512, 512] + - [106, 13161.0] + - - [512, 1462, 1, 2048, 512, 512, 512, 2048] + - [106, 14419.0] + - - [512, 1467, 1, 2048, 512, 512, 512, 2048] + - [83, 14436.0] + - - [512, 1472, 1, 2048, 512, 512, 512, 2048] + - [106, 14592.0] + - - [512, 1520, 1, 512, 512, 512, 512, 512] + - [106, 13683.0] + - - [512, 1520, 1, 2048, 512, 512, 512, 2048] + - [83, 14982.0] + - - [512, 1596, 1, 512, 512, 512, 512, 512] + - [63, 14144.0] + - - [512, 1596, 1, 2048, 512, 512, 512, 2048] + - [83, 15667.0] + - - [512, 1599, 1, 512, 512, 512, 512, 512] + - [63, 14057.0] + - - [512, 1599, 1, 2048, 512, 512, 512, 2048] + - [63, 15687.0] + - - [512, 1615, 1, 512, 512, 512, 512, 512] + - [63, 14351.0] + - - [512, 1615, 1, 2048, 512, 512, 512, 2048] + - [106, 15898.0] + - - [512, 1680, 1, 512, 512, 512, 512, 512] + - [84, 12343.0] + - - [512, 1680, 1, 2048, 512, 512, 512, 2048] + - [107, 13263.0] + - - [512, 1709, 1, 2048, 512, 512, 512, 2048] + - [64, 13498.0] + - - [512, 1890, 1, 512, 512, 512, 512, 512] + - [84, 13724.0] + - - [512, 1902, 1, 2048, 512, 512, 512, 2048] + - [84, 14926.0] + - - [512, 1917, 1, 512, 512, 512, 512, 512] + - [84, 13798.0] + - - [512, 1917, 1, 2048, 512, 512, 512, 2048] + - [107, 14965.0] + - - [512, 2076, 1, 2048, 512, 512, 512, 2048] + - [107, 16240.0] + - - [512, 2195, 1, 2048, 512, 512, 512, 2048] + - [84, 17115.0] + - - [512, 2205, 1, 512, 512, 512, 512, 512] + - [107, 15906.0] + - - [2048, 198, 1, 512, 2048, 2048, 2048, 512] + - [75, 9684.0] + - - [2048, 207, 1, 512, 2048, 2048, 2048, 512] + - [75, 10124.0] + - - [2048, 208, 1, 512, 2048, 2048, 2048, 512] + - [75, 10144.0] + - - [2048, 245, 1, 512, 2048, 2048, 2048, 512] + - [83, 11489.0] + - - [2048, 246, 1, 512, 2048, 2048, 2048, 512] + - [75, 11588.0] + - - [2048, 264, 1, 512, 2048, 2048, 2048, 512] + - [56, 9652.0] + - - [2048, 401, 1, 512, 2048, 2048, 2048, 512] + - [76, 12202.0] + - - [2048, 439, 1, 512, 2048, 2048, 2048, 512] + - [99, 12981.0] + - - [2048, 443, 1, 512, 2048, 2048, 2048, 512] + - [99, 13056.0] + - - [2048, 446, 1, 512, 2048, 2048, 2048, 512] + - [76, 13100.0] + - - [2048, 465, 1, 512, 2048, 2048, 2048, 512] + - [98, 13650.0] + - - [2048, 468, 1, 512, 2048, 2048, 2048, 512] + - [98, 13662.0] + - - [2048, 493, 1, 512, 2048, 2048, 2048, 512] + - [75, 14124.0] + - - [2048, 495, 1, 512, 2048, 2048, 2048, 512] + - [98, 14346.0] + - - [2048, 511, 1, 512, 2048, 2048, 2048, 512] + - [98, 14720.0] + - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] + - [76, 15366.0] + - - [2048, 540, 1, 512, 2048, 2048, 2048, 512] + - [63, 15328.0] + - - [2048, 550, 1, 512, 2048, 2048, 2048, 512] + - [63, 15545.0] + - - [2048, 560, 1, 512, 2048, 2048, 2048, 512] + - [63, 15785.0] + - - [2048, 600, 1, 512, 2048, 2048, 2048, 512] + - [98, 14266.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [78, 8930.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [54, 7208.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [96, 6978.0] + - - [64, 70, 216, 70, 64, 64, 64, 70] + - [54, 6733.0] + - - [64, 71, 216, 71, 64, 64, 64, 71] + - [54, 6805.0] + - - [64, 78, 248, 77, 64, 64, 64, 77] + - [96, 7530.0] + - - [64, 80, 152, 80, 64, 64, 64, 80] + - [73, 6540.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [55, 8082.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [55, 8744.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [78, 9717.0] + - - [64, 122, 264, 123, 64, 64, 64, 123] + - [59, 9752.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [78, 9877.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [108, 13916.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [69, 10857.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [108, 11312.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [65, 12477.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [64, 16076.0] + - - [512, 1600, 1, 32, 512, 512, 512, 32] + - [94, 6012.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [63, 14842.0] + - - [560, 1600, 1, 1024, 560, 560, 560, 1024] + - [64, 13328.0] + - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] + - [80, 14268.0] + - - [64, 192, 64, 1280, 64, 64, 64, 1280] + - [65, 13487.0] + - - [64, 320, 64, 1280, 64, 64, 64, 1280] + - [65, 12681.0] + - - [64, 384, 64, 1280, 64, 64, 64, 1280] + - [108, 10962.0] + - - [64, 448, 64, 1280, 64, 64, 64, 1280] + - [91, 10297.0] + - - [64, 192, 64, 2048, 64, 64, 64, 2048] + - [108, 12825.0] + - - [64, 320, 64, 2048, 64, 64, 64, 2048] + - [109, 8755.0] + - - [64, 384, 64, 2048, 64, 64, 64, 2048] + - [71, 8647.0] + - - [64, 448, 64, 2048, 64, 64, 64, 2048] + - [92, 8459.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 192] + - [75, 17045.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 256] + - [63, 17081.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 288] + - [98, 17537.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 64] + - [52, 7702.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [75, 16621.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [66, 10156.0] + - - [64, 192, 32, 1280, 64, 64, 64, 1280] + - [82, 10038.0] + - - [64, 320, 32, 1280, 64, 64, 64, 1280] + - [65, 11401.0] + - - [64, 384, 32, 1280, 64, 64, 64, 1280] + - [65, 13472.0] + - - [64, 448, 32, 1280, 64, 64, 64, 1280] + - [108, 12457.0] + - - [64, 192, 32, 2048, 64, 64, 64, 2048] + - [62, 10264.0] + - - [64, 320, 32, 2048, 64, 64, 64, 2048] + - [85, 11584.0] + - - [64, 384, 32, 2048, 64, 64, 64, 2048] + - [65, 13415.0] + - - [64, 448, 32, 2048, 64, 64, 64, 2048] + - [65, 12346.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 64] + - [98, 10782.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 256] + - [75, 17717.0] + - - [196, 256, 32, 1024, 196, 196, 196, 1024] + - [106, 12458.0] + - - [256, 4096, 1, 4, 256, 256, 256, 4] + - [94, 1371.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [84, 14513.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [98, 15029.0] + - - [768, 768, 1, 384, 768, 768, 768, 384] + - [98, 13895.0] + - - [100, 128, 120, 512, 100, 100, 100, 512] + - [106, 12888.0] + - - [100, 128, 139, 512, 100, 100, 100, 512] + - [106, 13134.0] + - - [100, 128, 160, 512, 100, 100, 100, 512] + - [67, 13163.0] + - - [22500, 64, 1, 147, 22500, 22500, 22500, 147] + - [83, 13745.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 14559.0] + - - [1024, 616, 1, 1024, 1024, 1024, 1024, 1024] + - [63, 11783.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [108, 10437.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [68, 9811.0] + - - [1024, 1024, 1, 2, 1024, 1024, 1024, 2] + - [61, 681.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [65, 13039.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [63, 14844.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [108, 12817.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 15169.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [65, 12930.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 15526.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [85, 13533.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [98, 10143.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [98, 12154.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [108, 11152.0] + - - [1024, 960, 1, 64, 1024, 1024, 1024, 64] + - [54, 9769.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [71, 9821.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [85, 11578.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [111, 10564.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [90, 10809.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [113, 9762.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [108, 11546.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [113, 9783.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [113, 9727.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [108, 12990.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [84, 12928.0] + - - [1024, 864, 1, 512, 1024, 1024, 1024, 512] + - [63, 12479.0] + - - [256, 3456, 1, 128, 256, 256, 256, 128] + - [106, 10973.0] + - - [256, 4096, 1, 128, 256, 256, 256, 128] + - [98, 12544.0] + - - [480, 864, 1, 1024, 480, 480, 480, 1024] + - [83, 10638.0] + - - [512, 864, 1, 256, 512, 512, 512, 256] + - [104, 8579.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [109, 7236.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [86, 7035.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [113, 9678.0] + - - [256, 4096, 1, 1, 256, 256, 256, 1] + - [94, 324.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [70, 6095.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [70, 6122.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [89, 9533.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [108, 14088.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [65, 12401.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [110, 10763.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [68, 12041.0] + - - [950, 512, 2, 2048, 950, 950, 950, 2048] + - [64, 14649.0] + - - [3400, 256, 1, 1024, 3400, 3400, 3400, 1024] + - [76, 13019.0] + - - [3800, 256, 1, 1024, 3800, 3800, 3800, 1024] + - [57, 14466.0] + - - [850, 512, 2, 2048, 850, 850, 850, 2048] + - [64, 13277.0] + - - [805, 512, 2, 2048, 805, 805, 805, 2048] + - [65, 14040.0] + - - [864, 512, 2, 2048, 864, 864, 864, 2048] + - [64, 13512.0] + - - [950, 256, 2, 2048, 950, 950, 950, 2048] + - [63, 12791.0] + - - [888, 512, 2, 2048, 888, 888, 888, 2048] + - [107, 13829.0] + - - [51520, 64, 2, 256, 51520, 51520, 51520, 256] + - [56, 17940.0] + - - [46464, 64, 2, 256, 46464, 46464, 46464, 256] + - [98, 18117.0] + - - [49152, 64, 2, 256, 49152, 49152, 49152, 256] + - [98, 18236.0] + - - [1900, 512, 1, 1024, 1900, 1900, 1900, 1024] + - [84, 14354.0] + - - [1700, 512, 1, 1024, 1700, 1700, 1700, 1024] + - [84, 12966.0] + - - [1610, 512, 1, 1024, 1610, 1610, 1610, 1024] + - [106, 15063.0] + - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] + - [83, 14809.0] + - - [1728, 512, 1, 1024, 1728, 1728, 1728, 1024] + - [107, 13191.0] + - - [1024, 1024, 1, 320, 1024, 1024, 1024, 320] + - [76, 14821.0] + - - [51520, 64, 2, 64, 51520, 51520, 51520, 64] + - [104, 15902.0] + - - [55296, 64, 2, 64, 55296, 55296, 55296, 64] + - [98, 16484.0] + - - [49152, 64, 2, 64, 49152, 49152, 49152, 64] + - [56, 16055.0] + - - [54400, 64, 2, 64, 54400, 54400, 54400, 64] + - [63, 16518.0] + - - [42240, 64, 2, 256, 42240, 42240, 42240, 256] + - [98, 18117.0] + - - [672, 512, 2, 2048, 672, 672, 672, 2048] + - [83, 13275.0] + - - [54400, 64, 2, 256, 54400, 54400, 54400, 256] + - [88, 17476.0] + - - [56832, 64, 2, 256, 56832, 56832, 56832, 256] + - [67, 17001.0] + - - [55296, 64, 2, 256, 55296, 55296, 55296, 256] + - [111, 17224.0] + - - [60800, 64, 2, 64, 60800, 60800, 60800, 64] + - [56, 15692.0] + - - [660, 512, 2, 2048, 660, 660, 660, 2048] + - [83, 12992.0] + - - [768, 512, 2, 2048, 768, 768, 768, 2048] + - [83, 15197.0] + - - [43008, 64, 2, 256, 43008, 43008, 43008, 256] + - [98, 18379.0] + - - [864, 256, 2, 2048, 864, 864, 864, 2048] + - [63, 11720.0] + - - [726, 512, 2, 2048, 726, 726, 726, 2048] + - [83, 14275.0] + - - [768, 256, 2, 2048, 768, 768, 768, 2048] + - [105, 10821.0] + - - [45632, 64, 2, 256, 45632, 45632, 45632, 256] + - [63, 18182.0] + - - [713, 512, 2, 2048, 713, 713, 713, 2048] + - [83, 13990.0] + - - [805, 256, 2, 2048, 805, 805, 805, 2048] + - [82, 11291.0] + - - [60800, 64, 2, 256, 60800, 60800, 60800, 256] + - [67, 14944.0] + - - [850, 256, 2, 2048, 850, 850, 850, 2048] + - [83, 11259.0] + - - [1024, 1024, 1, 81, 1024, 1024, 1024, 81] + - [56, 10697.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [88, 11041.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [67, 11309.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [88, 10592.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [67, 10779.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [63, 13340.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [67, 10882.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [83, 12222.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [111, 12686.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [71, 9893.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [71, 9826.0] + - - [1024, 80, 1, 30522, 1024, 1024, 1024, 30522] + - [114, 8613.0] + - - [1024, 120, 1, 30522, 1024, 1024, 1024, 30522] + - [114, 12796.0] + - - [1024, 77, 1, 30522, 1024, 1024, 1024, 30522] + - [114, 8304.0] + - - [1024, 200, 1, 30522, 1024, 1024, 1024, 30522] + - [120, 12386.0] + - - [1024, 160, 1, 30522, 1024, 1024, 1024, 30522] + - [114, 12304.0] + - - [1024, 180, 1, 30522, 1024, 1024, 1024, 30522] + - [114, 13900.0] + - - [1024, 160, 1, 30528, 1024, 1024, 1024, 30528] + - [116, 12357.0] + - - [1024, 240, 1, 30528, 1024, 1024, 1024, 30528] + - [115, 14701.0] + - - [2560, 109, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 13110.0] + - - [2560, 121, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 14538.0] + - - [2560, 65, 1, 29000, 2560, 2560, 2560, 29000] + - [116, 7860.0] + - - [2560, 66, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 7973.0] + - - [2560, 67, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 8108.0] + - - [2560, 69, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 8346.0] + - - [2560, 70, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 8461.0] + - - [2560, 71, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 8567.0] + - - [2560, 73, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 8813.0] + - - [2560, 74, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 8910.0] + - - [2560, 75, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 9062.0] + - - [2560, 77, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 9300.0] + - - [2560, 78, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 9427.0] + - - [2560, 80, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 9662.0] + - - [2560, 81, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 9767.0] + - - [2560, 82, 1, 29000, 2560, 2560, 2560, 29000] + - [118, 9857.0] + - - [2560, 83, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 10032.0] + - - [2560, 84, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 10137.0] + - - [2560, 88, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 10605.0] + - - [2560, 89, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 10737.0] + - - [2560, 90, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 10852.0] + - - [2560, 92, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 11095.0] + - - [2560, 95, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 11426.0] + - - [2560, 98, 1, 29000, 2560, 2560, 2560, 29000] + - [119, 11806.0] + - - [512, 200, 1, 32, 512, 512, 512, 32] + - [125, 1214.0] + - - [1024, 200, 1, 1, 1024, 1024, 1024, 1] + - [144, 120.0] + - - [512, 200, 1, 1, 512, 512, 512, 1] + - [146, 66.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [170, 6535.0] + - - [768, 160, 1, 768, 768, 768, 768, 768] + - [124, 5308.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [169, 5094.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [145, 5641.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [136, 5423.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [138, 6199.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 128] + - [123, 2958.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [123, 4173.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [129, 5730.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [124, 6747.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [124, 5570.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [145, 7202.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [147, 7131.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [169, 6824.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [124, 5061.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [136, 4234.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [147, 7473.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [155, 6352.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [126, 6096.0] + - - [704, 256, 1, 128, 704, 704, 704, 128] + - [124, 4471.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [169, 4419.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [147, 6157.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [124, 5919.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [136, 6385.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [163, 6331.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [154, 5703.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [147, 6248.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [154, 5657.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [161, 4046.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [124, 5579.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [136, 3762.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [169, 4369.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [147, 6484.0] + - - [35, 8457, 1, 1760, 35, 35, 35, 1760] + - [147, 4139.0] + - - [64, 2944, 1, 128, 64, 64, 64, 128] + - [169, 4603.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [154, 6970.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [124, 6258.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [147, 6443.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [154, 4230.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [147, 7066.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [147, 4985.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [136, 4473.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [136, 5303.0] + - - [35, 8457, 1, 2560, 35, 35, 35, 2560] + - [155, 4136.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 128] + - [126, 5611.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [154, 5690.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [169, 5315.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [124, 4501.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [124, 3495.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [124, 4428.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [147, 7327.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [124, 5851.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [124, 4471.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [154, 6284.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 128] + - [124, 4692.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [154, 5906.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [123, 3517.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [136, 5457.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [170, 7536.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [163, 6691.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [126, 6388.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [155, 5478.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 128] + - [124, 4690.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [147, 7151.0] + - - [64, 4288, 1, 128, 64, 64, 64, 128] + - [155, 5647.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [164, 6114.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [169, 6393.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [154, 6745.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [154, 4433.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [136, 6610.0] + - - [35, 8457, 1, 4096, 35, 35, 35, 4096] + - [155, 3987.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [163, 4846.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 128] + - [163, 3774.0] + - - [256, 1024, 1, 128, 256, 256, 256, 128] + - [147, 5430.0] + - - [64, 1408, 1, 128, 64, 64, 64, 128] + - [135, 2869.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [145, 6479.0] + - - [35, 8457, 1, 2048, 35, 35, 35, 2048] + - [155, 4133.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [169, 5544.0] + - - [448, 256, 1, 128, 448, 448, 448, 128] + - [136, 3336.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [169, 4359.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [124, 5352.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [169, 6557.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [123, 2927.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [136, 5465.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [145, 6306.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [136, 5460.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 128] + - [136, 3364.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [138, 7094.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [123, 2927.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [169, 6466.0] + - - [256, 448, 1, 128, 256, 256, 256, 128] + - [163, 3191.0] + - - [64, 3584, 1, 128, 64, 64, 64, 128] + - [155, 4766.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [124, 4369.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [124, 6525.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [124, 6455.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [123, 3549.0] + - - [64, 1856, 1, 128, 64, 64, 64, 128] + - [136, 3471.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [169, 4480.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [126, 5018.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [124, 5332.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [147, 6517.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [155, 7430.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [124, 4296.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 128] + - [147, 5377.0] + - - [256, 704, 1, 128, 256, 256, 256, 128] + - [124, 4419.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [163, 7004.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [126, 7106.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [136, 4891.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [170, 6257.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [138, 7244.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [124, 5239.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [154, 4343.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [150, 6175.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [123, 3495.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [124, 5387.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [136, 5874.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [136, 4455.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [155, 5673.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [155, 6517.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [169, 4235.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [153, 3884.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [124, 6286.0] + - - [64, 2368, 1, 128, 64, 64, 64, 128] + - [136, 3731.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [147, 7187.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [126, 5759.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [166, 5604.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [153, 3474.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [154, 6275.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [170, 6587.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [161, 4927.0] + - - [448, 448, 1, 128, 448, 448, 448, 128] + - [124, 4959.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [163, 5328.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 6991.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [136, 4319.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 5476.0] + - - [512, 256, 1, 1024, 512, 512, 512, 1024] + - [169, 6101.0] + - - [1024, 256, 1, 2048, 1024, 1024, 1024, 2048] + - [126, 7166.0] + - - [1024, 200, 1, 4096, 1024, 1024, 1024, 4096] + - [147, 5587.0] + - - [1024, 200, 1, 512, 1024, 1024, 1024, 512] + - [147, 5238.0] + - - [512, 200, 1, 1024, 512, 512, 512, 1024] + - [169, 4775.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [169, 5519.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 4096] + - [138, 7123.0] + - - [1024, 200, 1, 2048, 1024, 1024, 1024, 2048] + - [126, 5613.0] + - - [1024, 256, 1, 512, 1024, 1024, 1024, 512] + - [126, 6684.0] + - - [512, 200, 1, 2048, 512, 512, 512, 2048] + - [136, 4803.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [142, 6647.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [136, 4932.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [136, 7458.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [136, 6918.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [169, 6120.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [136, 5774.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [163, 4466.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [136, 5141.0] + - - [512, 512, 1, 1024, 512, 512, 512, 1024] + - [170, 7060.0] + - - [512, 512, 1, 2000, 512, 512, 512, 2000] + - [126, 7182.0] + - - [100, 1024, 1, 2048, 100, 100, 100, 2048] + - [154, 4877.0] + - - [100, 2000, 1, 1024, 100, 100, 100, 1024] + - [138, 5373.0] + - - [128, 2000, 1, 100, 128, 128, 128, 100] + - [147, 4812.0] + - - [64, 2000, 1, 1024, 64, 64, 64, 1024] + - [136, 5920.0] + - - [100, 1024, 1, 1024, 100, 100, 100, 1024] + - [169, 4788.0] + - - [128, 1024, 1, 512, 128, 128, 128, 512] + - [136, 5697.0] + - - [512, 500, 1, 2000, 512, 512, 512, 2000] + - [147, 6996.0] + - - [500, 512, 1, 100, 500, 500, 500, 100] + - [126, 4741.0] + - - [100, 1024, 1, 500, 100, 100, 100, 500] + - [136, 4245.0] + - - [128, 2000, 1, 512, 128, 128, 128, 512] + - [155, 6463.0] + - - [256, 1024, 1, 100, 256, 256, 256, 100] + - [147, 4855.0] + - - [200, 500, 1, 1024, 200, 200, 200, 1024] + - [169, 4693.0] + - - [100, 2000, 1, 512, 100, 100, 100, 512] + - [138, 5105.0] + - - [200, 512, 1, 100, 200, 200, 200, 100] + - [167, 2522.0] + - - [64, 2048, 1, 10, 64, 64, 64, 10] + - [125, 712.0] + - - [64, 2048, 1, 500, 64, 64, 64, 500] + - [169, 5507.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [147, 6806.0] + - - [500, 500, 1, 2000, 500, 500, 500, 2000] + - [126, 6844.0] + - - [256, 500, 1, 10, 256, 256, 256, 10] + - [162, 667.0] + - - [512, 500, 1, 512, 512, 512, 512, 512] + - [147, 6554.0] + - - [128, 1024, 1, 2000, 128, 128, 128, 2000] + - [145, 6289.0] + - - [100, 2000, 1, 2048, 100, 100, 100, 2048] + - [138, 5473.0] + - - [256, 512, 1, 10, 256, 256, 256, 10] + - [146, 683.0] + - - [64, 2000, 1, 2048, 64, 64, 64, 2048] + - [169, 5909.0] + - - [64, 2048, 1, 512, 64, 64, 64, 512] + - [136, 5649.0] + - - [64, 2000, 1, 10, 64, 64, 64, 10] + - [125, 696.0] + - - [128, 1024, 1, 500, 128, 128, 128, 500] + - [136, 5452.0] + - - [200, 512, 1, 1024, 200, 200, 200, 1024] + - [169, 4792.0] + - - [128, 2048, 1, 10, 128, 128, 128, 10] + - [162, 1214.0] + - - [64, 2048, 1, 100, 64, 64, 64, 100] + - [169, 3136.0] + - - [64, 2000, 1, 100, 64, 64, 64, 100] + - [169, 2936.0] + - - [200, 500, 1, 100, 200, 200, 200, 100] + - [160, 2415.0] + - - [500, 500, 1, 500, 500, 500, 500, 500] + - [147, 6391.0] + - - [128, 2048, 1, 512, 128, 128, 128, 512] + - [155, 6765.0] + - - [100, 2048, 1, 500, 100, 100, 100, 500] + - [147, 5203.0] + - - [500, 500, 1, 2048, 500, 500, 500, 2048] + - [147, 6799.0] + - - [128, 2000, 1, 2000, 128, 128, 128, 2000] + - [126, 6993.0] + - - [256, 500, 1, 1024, 256, 256, 256, 1024] + - [169, 5952.0] + - - [64, 2048, 1, 2000, 64, 64, 64, 2000] + - [136, 6403.0] + - - [100, 2048, 1, 1024, 100, 100, 100, 1024] + - [138, 5490.0] + - - [128, 1024, 1, 100, 128, 128, 128, 100] + - [163, 3034.0] + - - [256, 1024, 1, 2048, 256, 256, 256, 2048] + - [138, 7170.0] + - - [500, 512, 1, 512, 500, 500, 500, 512] + - [155, 6541.0] + - - [256, 500, 1, 2000, 256, 256, 256, 2000] + - [124, 6139.0] + - - [256, 512, 1, 100, 256, 256, 256, 100] + - [163, 3006.0] + - - [128, 2000, 1, 500, 128, 128, 128, 500] + - [147, 6517.0] + - - [200, 512, 1, 2048, 200, 200, 200, 2048] + - [145, 4638.0] + - - [64, 2048, 1, 2048, 64, 64, 64, 2048] + - [169, 6095.0] + - - [200, 1024, 1, 2048, 200, 200, 200, 2048] + - [138, 5609.0] + - - [512, 512, 1, 10, 512, 512, 512, 10] + - [162, 1225.0] + - - [512, 500, 1, 10, 512, 512, 512, 10] + - [162, 1164.0] + - - [200, 512, 1, 10, 200, 200, 200, 10] + - [125, 569.0] + - - [500, 500, 1, 1024, 500, 500, 500, 1024] + - [138, 6695.0] + - - [256, 1024, 1, 512, 256, 256, 256, 512] + - [170, 6758.0] + - - [256, 500, 1, 512, 256, 256, 256, 512] + - [169, 5354.0] + - - [200, 500, 1, 2048, 200, 200, 200, 2048] + - [152, 4391.0] + - - [100, 2000, 1, 10, 100, 100, 100, 10] + - [162, 901.0] + - - [100, 2048, 1, 2048, 100, 100, 100, 2048] + - [138, 5604.0] + - - [128, 1024, 1, 2048, 128, 128, 128, 2048] + - [154, 6199.0] + - - [100, 2000, 1, 500, 100, 100, 100, 500] + - [126, 5076.0] + - - [100, 2048, 1, 100, 100, 100, 100, 100] + - [147, 3793.0] + - - [100, 1024, 1, 10, 100, 100, 100, 10] + - [125, 545.0] + - - [100, 1024, 1, 2000, 100, 100, 100, 2000] + - [154, 4904.0] + - - [256, 512, 1, 500, 256, 256, 256, 500] + - [145, 5398.0] + - - [100, 2000, 1, 100, 100, 100, 100, 100] + - [147, 3717.0] + - - [128, 1024, 1, 10, 128, 128, 128, 10] + - [162, 697.0] + - - [100, 2048, 1, 10, 100, 100, 100, 10] + - [162, 914.0] + - - [512, 500, 1, 100, 512, 512, 512, 100] + - [147, 4830.0] + - - [128, 2000, 1, 1024, 128, 128, 128, 1024] + - [138, 6852.0] + - - [200, 1024, 1, 500, 200, 200, 200, 500] + - [147, 5257.0] + - - [256, 512, 1, 2000, 256, 256, 256, 2000] + - [124, 6299.0] + - - [256, 1024, 1, 2000, 256, 256, 256, 2000] + - [147, 7214.0] + - - [200, 512, 1, 500, 200, 200, 200, 500] + - [136, 4274.0] + - - [64, 2000, 1, 512, 64, 64, 64, 512] + - [169, 5372.0] + - - [200, 1024, 1, 100, 200, 200, 200, 100] + - [147, 3894.0] + - - [200, 1024, 1, 1024, 200, 200, 200, 1024] + - [138, 5496.0] + - - [500, 512, 1, 2000, 500, 500, 500, 2000] + - [126, 7012.0] + - - [200, 500, 1, 512, 200, 200, 200, 512] + - [154, 4204.0] + - - [256, 512, 1, 512, 256, 256, 256, 512] + - [136, 5592.0] + - - [512, 512, 1, 500, 512, 512, 512, 500] + - [147, 6805.0] + - - [100, 1024, 1, 512, 100, 100, 100, 512] + - [154, 4376.0] + - - [128, 1024, 1, 1024, 128, 128, 128, 1024] + - [136, 6179.0] + - - [200, 512, 1, 2000, 200, 200, 200, 2000] + - [145, 4865.0] + - - [256, 1024, 1, 500, 256, 256, 256, 500] + - [147, 6735.0] + - - [200, 1024, 1, 512, 200, 200, 200, 512] + - [138, 5285.0] + - - [256, 500, 1, 500, 256, 256, 256, 500] + - [136, 5255.0] + - - [256, 500, 1, 2048, 256, 256, 256, 2048] + - [145, 5931.0] + - - [512, 500, 1, 1024, 512, 512, 512, 1024] + - [170, 6877.0] + - - [256, 512, 1, 1024, 256, 256, 256, 1024] + - [136, 6134.0] + - - [128, 2048, 1, 1024, 128, 128, 128, 1024] + - [155, 7064.0] + - - [500, 512, 1, 500, 500, 500, 500, 500] + - [147, 6584.0] + - - [200, 500, 1, 500, 200, 200, 200, 500] + - [136, 4160.0] + - - [64, 2000, 1, 2000, 64, 64, 64, 2000] + - [169, 6163.0] + - - [128, 2000, 1, 2048, 128, 128, 128, 2048] + - [138, 7002.0] + - - [256, 1024, 1, 10, 256, 256, 256, 10] + - [162, 1192.0] + - - [256, 1024, 1, 1024, 256, 256, 256, 1024] + - [155, 7049.0] + - - [500, 500, 1, 10, 500, 500, 500, 10] + - [125, 1106.0] + - - [256, 500, 1, 100, 256, 256, 256, 100] + - [136, 2857.0] + - - [256, 512, 1, 2048, 256, 256, 256, 2048] + - [145, 6168.0] + - - [200, 1024, 1, 2000, 200, 200, 200, 2000] + - [126, 5629.0] + - - [100, 2048, 1, 512, 100, 100, 100, 512] + - [138, 5238.0] + - - [512, 500, 1, 2048, 512, 512, 512, 2048] + - [147, 6970.0] + - - [128, 2048, 1, 2000, 128, 128, 128, 2000] + - [126, 7206.0] + - - [500, 512, 1, 2048, 500, 500, 500, 2048] + - [138, 6968.0] + - - [200, 500, 1, 2000, 200, 200, 200, 2000] + - [145, 4751.0] + - - [500, 512, 1, 1024, 500, 500, 500, 1024] + - [138, 6834.0] + - - [100, 1024, 1, 100, 100, 100, 100, 100] + - [134, 2338.0] + - - [64, 2000, 1, 500, 64, 64, 64, 500] + - [136, 5333.0] + - - [128, 2048, 1, 2048, 128, 128, 128, 2048] + - [138, 7193.0] + - - [128, 2000, 1, 10, 128, 128, 128, 10] + - [162, 1196.0] + - - [500, 512, 1, 10, 500, 500, 500, 10] + - [125, 1133.0] + - - [200, 512, 1, 512, 200, 200, 200, 512] + - [154, 4290.0] + - - [512, 500, 1, 500, 512, 512, 512, 500] + - [147, 6598.0] + - - [512, 512, 1, 100, 512, 512, 512, 100] + - [155, 4946.0] + - - [500, 500, 1, 512, 500, 500, 500, 512] + - [147, 6356.0] + - - [128, 2048, 1, 500, 128, 128, 128, 500] + - [147, 6687.0] + - - [200, 500, 1, 10, 200, 200, 200, 10] + - [125, 556.0] + - - [100, 2048, 1, 2000, 100, 100, 100, 2000] + - [126, 5622.0] + - - [200, 1024, 1, 10, 200, 200, 200, 10] + - [125, 994.0] + - - [64, 2048, 1, 1024, 64, 64, 64, 1024] + - [136, 6157.0] + - - [100, 2000, 1, 2000, 100, 100, 100, 2000] + - [126, 5481.0] + - - [500, 500, 1, 100, 500, 500, 500, 100] + - [126, 4545.0] + - - [128, 2048, 1, 100, 128, 128, 128, 100] + - [147, 4984.0] + - - [4096, 64, 1, 2048, 4096, 4096, 4096, 2048] + - [147, 7040.0] + - - [4096, 91, 1, 2048, 4096, 4096, 4096, 2048] + - [154, 6421.0] + - - [4096, 86, 1, 3072, 4096, 4096, 4096, 3072] + - [145, 6099.0] + - - [4096, 49, 1, 2048, 4096, 4096, 4096, 2048] + - [147, 5394.0] + - - [4096, 91, 1, 3072, 4096, 4096, 4096, 3072] + - [145, 6468.0] + - - [4096, 64, 1, 3072, 4096, 4096, 4096, 3072] + - [138, 7090.0] + - - [4096, 63, 1, 3072, 4096, 4096, 4096, 3072] + - [126, 6976.0] + - - [4096, 96, 1, 2048, 4096, 4096, 4096, 2048] + - [169, 6766.0] + - - [4096, 32, 1, 2048, 4096, 4096, 4096, 2048] + - [150, 5592.0] + - - [4096, 49, 1, 3072, 4096, 4096, 4096, 3072] + - [126, 5430.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [124, 4309.0] + - - [4096, 86, 1, 2048, 4096, 4096, 4096, 2048] + - [154, 6129.0] + - - [4096, 96, 1, 3072, 4096, 4096, 4096, 3072] + - [169, 6790.0] + - - [4096, 35, 1, 3072, 4096, 4096, 4096, 3072] + - [147, 3886.0] + - - [4096, 50, 1, 2048, 4096, 4096, 4096, 2048] + - [138, 5503.0] + - - [36548, 32, 1, 1024, 36548, 36548, 36548, 1024] + - [163, 8133.0] + - - [4096, 32, 1, 3072, 4096, 4096, 4096, 3072] + - [129, 5571.0] + - - [1024, 243, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 6649.0] + - - [4096, 50, 1, 3072, 4096, 4096, 4096, 3072] + - [138, 5548.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [154, 5569.0] + - - [1024, 216, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 5901.0] + - - [4096, 35, 1, 2048, 4096, 4096, 4096, 2048] + - [126, 3867.0] + - - [4096, 63, 1, 2048, 4096, 4096, 4096, 2048] + - [126, 6934.0] + - - [289, 256, 1, 1568, 289, 289, 289, 1568] + - [136, 3625.0] + - - [3025, 64, 1, 363, 3025, 3025, 3025, 363] + - [124, 5945.0] + - - [784, 32, 32, 192, 784, 784, 784, 192] + - [136, 7038.0] + - - [289, 256, 1, 2016, 289, 289, 289, 2016] + - [136, 3701.0] + - - [21609, 32, 1, 288, 21609, 21609, 21609, 288] + - [163, 7237.0] + - - [1225, 192, 1, 1728, 1225, 1225, 1225, 1728] + - [126, 6423.0] + - - [784, 96, 1, 800, 784, 784, 784, 800] + - [123, 3409.0] + - - [1225, 64, 1, 1200, 1225, 1225, 1225, 1200] + - [123, 3630.0] + - - [729, 192, 1, 1600, 729, 729, 729, 1600] + - [163, 6242.0] + - - [6272, 32, 1, 528, 6272, 6272, 6272, 528] + - [163, 6446.0] + - - [1568, 160, 1, 832, 1568, 1568, 1568, 832] + - [124, 6494.0] + - - [289, 256, 1, 1792, 289, 289, 289, 1792] + - [134, 3691.0] + - - [784, 32, 32, 256, 784, 784, 784, 256] + - [163, 7156.0] + - - [6272, 32, 1, 512, 6272, 6272, 6272, 512] + - [124, 6447.0] + - - [289, 384, 1, 3456, 289, 289, 289, 3456] + - [145, 5367.0] + - - [289, 384, 1, 2592, 289, 289, 289, 2592] + - [124, 5311.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 192] + - [163, 7574.0] + - - [1568, 128, 1, 832, 1568, 1568, 1568, 832] + - [124, 6761.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 288] + - [163, 6060.0] + - - [1001, 128, 1, 2048, 1001, 1001, 1001, 2048] + - [154, 5755.0] + - - [2048, 174, 1, 512, 2048, 2048, 2048, 512] + - [126, 6253.0] + - - [2048, 189, 1, 512, 2048, 2048, 2048, 512] + - [138, 6750.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [163, 4179.0] + - - [64, 103, 16, 103, 64, 64, 64, 103] + - [144, 2447.0] + - - [64, 104, 16, 103, 64, 64, 64, 103] + - [169, 2482.0] + - - [64, 123, 16, 112, 64, 64, 64, 112] + - [169, 3080.0] + - - [64, 123, 16, 123, 64, 64, 64, 123] + - [169, 3228.0] + - - [512, 540, 1, 512, 512, 512, 512, 512] + - [147, 7050.0] + - - [512, 540, 1, 2048, 512, 512, 512, 2048] + - [147, 7590.0] + - - [512, 550, 1, 512, 512, 512, 512, 512] + - [147, 7088.0] + - - [512, 550, 1, 2048, 512, 512, 512, 2048] + - [147, 7725.0] + - - [512, 560, 1, 512, 512, 512, 512, 512] + - [147, 7210.0] + - - [512, 560, 1, 2048, 512, 512, 512, 2048] + - [147, 7850.0] + - - [2048, 160, 1, 512, 2048, 2048, 2048, 512] + - [124, 6684.0] + - - [2048, 184, 1, 512, 2048, 2048, 2048, 512] + - [126, 6594.0] + - - [512, 160, 1, 2048, 512, 512, 512, 2048] + - [144, 3873.0] + - - [512, 174, 1, 2048, 512, 512, 512, 2048] + - [136, 4212.0] + - - [512, 182, 1, 512, 512, 512, 512, 512] + - [136, 3885.0] + - - [512, 184, 1, 512, 512, 512, 512, 512] + - [136, 3928.0] + - - [512, 184, 1, 2048, 512, 512, 512, 2048] + - [136, 4405.0] + - - [512, 189, 1, 512, 512, 512, 512, 512] + - [154, 4035.0] + - - [512, 189, 1, 2048, 512, 512, 512, 2048] + - [145, 4613.0] + - - [512, 198, 1, 2048, 512, 512, 512, 2048] + - [136, 4784.0] + - - [512, 206, 1, 512, 512, 512, 512, 512] + - [136, 4390.0] + - - [512, 207, 1, 2048, 512, 512, 512, 2048] + - [136, 4999.0] + - - [512, 208, 1, 512, 512, 512, 512, 512] + - [136, 4433.0] + - - [512, 208, 1, 2048, 512, 512, 512, 2048] + - [136, 5025.0] + - - [512, 224, 1, 512, 512, 512, 512, 512] + - [136, 4918.0] + - - [512, 245, 1, 2048, 512, 512, 512, 2048] + - [124, 5799.0] + - - [512, 246, 1, 512, 512, 512, 512, 512] + - [169, 5234.0] + - - [512, 246, 1, 2048, 512, 512, 512, 2048] + - [124, 5820.0] + - - [512, 264, 1, 512, 512, 512, 512, 512] + - [154, 5554.0] + - - [512, 264, 1, 2048, 512, 512, 512, 2048] + - [124, 5838.0] + - - [512, 401, 1, 2048, 512, 512, 512, 2048] + - [136, 7252.0] + - - [512, 439, 1, 2048, 512, 512, 512, 2048] + - [147, 6167.0] + - - [512, 443, 1, 2048, 512, 512, 512, 2048] + - [147, 6217.0] + - - [512, 446, 1, 2048, 512, 512, 512, 2048] + - [147, 6246.0] + - - [512, 455, 1, 512, 512, 512, 512, 512] + - [147, 6012.0] + - - [512, 465, 1, 512, 512, 512, 512, 512] + - [147, 6138.0] + - - [512, 465, 1, 2048, 512, 512, 512, 2048] + - [147, 6536.0] + - - [512, 468, 1, 512, 512, 512, 512, 512] + - [147, 6177.0] + - - [512, 468, 1, 2048, 512, 512, 512, 2048] + - [147, 6573.0] + - - [512, 476, 1, 512, 512, 512, 512, 512] + - [147, 6289.0] + - - [512, 493, 1, 512, 512, 512, 512, 512] + - [147, 6468.0] + - - [512, 493, 1, 2048, 512, 512, 512, 2048] + - [147, 6902.0] + - - [512, 495, 1, 2048, 512, 512, 512, 2048] + - [147, 6924.0] + - - [512, 511, 1, 2048, 512, 512, 512, 2048] + - [147, 7139.0] + - - [512, 512, 1, 2048, 512, 512, 512, 2048] + - [147, 7193.0] + - - [64, 59, 512, 59, 64, 64, 64, 59] + - [145, 6814.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [145, 6847.0] + - - [256, 1024, 1, 1, 256, 256, 256, 1] + - [162, 151.0] + - - [257, 1024, 1, 4096, 257, 257, 257, 4096] + - [152, 5932.0] + - - [512, 215, 1, 2048, 512, 512, 512, 2048] + - [136, 5145.0] + - - [512, 256, 1, 2048, 512, 512, 512, 2048] + - [124, 6062.0] + - - [560, 200, 1, 1024, 560, 560, 560, 1024] + - [169, 5175.0] + - - [768, 215, 1, 2048, 768, 768, 768, 2048] + - [145, 5820.0] + - - [768, 256, 1, 2048, 768, 768, 768, 2048] + - [145, 6945.0] + - - [32, 33, 1600, 33, 32, 32, 32, 33] + - [122, 3186.0] + - - [512, 512, 1, 64, 512, 512, 512, 64] + - [147, 4346.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 192] + - [163, 8004.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 192] + - [154, 6188.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 256] + - [163, 6200.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 288] + - [124, 6209.0] + - - [49, 2048, 64, 512, 49, 49, 49, 512] + - [136, 6730.0] + - - [49, 512, 64, 2048, 49, 49, 49, 2048] + - [142, 5682.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 192] + - [136, 5987.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 256] + - [124, 6044.0] + - - [49, 2048, 32, 512, 49, 49, 49, 512] + - [136, 6565.0] + - - [49, 512, 32, 2048, 49, 49, 49, 2048] + - [136, 6000.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [136, 5372.0] + - - [100, 128, 18, 512, 100, 100, 100, 512] + - [170, 5760.0] + - - [100, 128, 19, 512, 100, 100, 100, 512] + - [169, 4815.0] + - - [1444, 128, 1, 576, 1444, 1444, 1444, 576] + - [124, 6035.0] + - - [361, 512, 1, 2304, 361, 361, 361, 2304] + - [145, 6560.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [138, 6304.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [147, 6323.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [124, 6472.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [136, 6300.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [173, 7177.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [124, 4475.0] + - - [64, 35, 4608, 32, 64, 64, 64, 32] + - [136, 4799.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [124, 4638.0] + - - [256, 864, 1, 128, 256, 256, 256, 128] + - [124, 4596.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 1024] + - [169, 6411.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 2048] + - [159, 5848.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 1024] + - [169, 6206.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 2048] + - [142, 5679.0] + - - [3136, 64, 1, 576, 3136, 3136, 3136, 576] + - [124, 6317.0] + - - [784, 128, 1, 1152, 784, 784, 784, 1152] + - [124, 4541.0] + - - [49, 2048, 128, 512, 49, 49, 49, 512] + - [136, 6809.0] + - - [49, 2048, 256, 512, 49, 49, 49, 512] + - [154, 6857.0] + - - [49, 512, 128, 2048, 49, 49, 49, 2048] + - [142, 5841.0] + - - [49, 512, 256, 2048, 49, 49, 49, 2048] + - [142, 5898.0] + - - [1024, 128, 1, 2, 1024, 1024, 1024, 2] + - [162, 142.0] + - - [1024, 96, 1, 2, 1024, 1024, 1024, 2] + - [162, 124.0] + - - [1909283, 40, 1, 40, 1909283, 1909283, 1909283, 40] + - [162, 3459.0] + - - [3818566, 40, 1, 40, 3818566, 3818566, 3818566, 40] + - [153, 3473.0] + - - [2560, 35, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3012.0] + - - [2560, 36, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3102.0] + - - [2560, 39, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3358.0] + - - [2560, 40, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3442.0] + - - [2560, 42, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3621.0] + - - [2560, 43, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3708.0] + - - [2560, 44, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3787.0] + - - [2560, 46, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 3956.0] + - - [2560, 48, 1, 29000, 2560, 2560, 2560, 29000] + - [144, 4125.0] + - - [2560, 49, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4087.0] + - - [2560, 50, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4162.0] + - - [2560, 51, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4257.0] + - - [2560, 53, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4412.0] + - - [2560, 54, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4501.0] + - - [2560, 55, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4566.0] + - - [2560, 56, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4661.0] + - - [2560, 57, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4751.0] + - - [2560, 58, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4824.0] + - - [2560, 59, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 4923.0] + - - [2560, 61, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 5078.0] + - - [2560, 63, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 5211.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [178, 3533.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 1280] + - [182, 1151.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [183, 3064.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 256] + - [177, 613.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 3328] + - [181, 1675.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [182, 2329.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 1280] + - [182, 765.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 1280] + - [184, 1884.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [184, 2386.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 1280] + - [182, 642.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [182, 377.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [181, 2266.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [202, 5717.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [182, 479.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 256] + - [177, 503.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [181, 558.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [182, 754.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [182, 1504.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [141, 112.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [178, 4242.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 256] + - [179, 854.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 3328] + - [181, 1262.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 1280] + - [183, 1658.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [182, 2769.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 128] + - [179, 802.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [193, 617.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 256] + - [149, 312.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [181, 1878.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [182, 239.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 1280] + - [203, 1507.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [183, 3369.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 3328] + - [182, 555.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [181, 473.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [190, 5146.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [202, 4353.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 128] + - [165, 218.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 256] + - [149, 403.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 128] + - [177, 914.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 128] + - [165, 441.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 3328] + - [205, 1823.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 128] + - [185, 711.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [182, 955.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 3328] + - [181, 1040.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 128] + - [177, 357.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 128] + - [165, 285.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [191, 1122.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [202, 5160.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [179, 149.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [191, 1257.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 1280] + - [182, 488.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 256] + - [177, 1241.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 128] + - [185, 620.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 3328] + - [182, 731.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 256] + - [185, 749.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 3328] + - [181, 838.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 3328] + - [184, 2082.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 1280] + - [182, 1365.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 128] + - [185, 527.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 256] + - [196, 973.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 3328] + - [182, 1498.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [180, 4622.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [181, 941.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 1280] + - [182, 946.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 256] + - [179, 1112.0] + - - [4096, 29, 1, 2048, 4096, 4096, 4096, 2048] + - [202, 4993.0] + - - [4096, 25, 1, 2048, 4096, 4096, 4096, 2048] + - [190, 4317.0] + - - [4096, 29, 1, 3072, 4096, 4096, 4096, 3072] + - [202, 5065.0] + - - [4096, 24, 1, 2048, 4096, 4096, 4096, 2048] + - [202, 5128.0] + - - [36548, 1, 1, 1024, 36548, 36548, 36548, 1024] + - [175, 409.0] + - - [4096, 27, 1, 2048, 4096, 4096, 4096, 2048] + - [202, 4565.0] + - - [4096, 1, 1, 2048, 4096, 4096, 4096, 2048] + - [182, 345.0] + - - [4096, 24, 1, 3072, 4096, 4096, 4096, 3072] + - [202, 5209.0] + - - [4096, 27, 1, 3072, 4096, 4096, 4096, 3072] + - [202, 4712.0] + - - [36548, 25, 1, 1024, 36548, 36548, 36548, 1024] + - [195, 4912.0] + - - [4096, 1, 1, 3072, 4096, 4096, 4096, 3072] + - [182, 344.0] + - - [4096, 25, 1, 3072, 4096, 4096, 4096, 3072] + - [202, 4359.0] + - - [36548, 24, 1, 1024, 36548, 36548, 36548, 1024] + - [190, 5711.0] + - - [6272, 16, 1, 480, 6272, 6272, 6272, 480] + - [190, 4068.0] + - - [1568, 32, 1, 832, 1568, 1568, 1568, 832] + - [128, 3292.0] + - - [1568, 48, 1, 832, 1568, 1568, 1568, 832] + - [188, 3723.0] + - - [6272, 24, 1, 512, 6272, 6272, 6272, 512] + - [180, 4781.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 512] + - [128, 138.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [174, 5.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [182, 382.0] + - - [2560, 4, 1, 2, 2560, 2560, 2560, 2] + - [174, 14.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [182, 886.0] + - - [12288, 12, 2, 256, 12288, 12288, 12288, 256] + - [178, 4157.0] + - - [12288, 3, 2, 256, 12288, 12288, 12288, 256] + - [175, 1535.0] + - - [51520, 12, 2, 256, 51520, 51520, 51520, 256] + - [202, 4811.0] + - - [51520, 3, 2, 256, 51520, 51520, 51520, 256] + - [185, 2204.0] + - - [15200, 12, 2, 256, 15200, 15200, 15200, 256] + - [202, 4214.0] + - - [15200, 3, 2, 256, 15200, 15200, 15200, 256] + - [194, 1612.0] + - - [3456, 3, 2, 256, 3456, 3456, 3456, 256] + - [176, 922.0] + - - [13600, 12, 2, 256, 13600, 13600, 13600, 256] + - [200, 4224.0] + - - [12880, 3, 2, 256, 12880, 12880, 12880, 256] + - [175, 1529.0] + - - [3400, 3, 2, 256, 3400, 3400, 3400, 256] + - [179, 900.0] + - - [12880, 12, 2, 256, 12880, 12880, 12880, 256] + - [190, 4037.0] + - - [13824, 12, 2, 256, 13824, 13824, 13824, 256] + - [200, 4307.0] + - - [13824, 3, 2, 256, 13824, 13824, 13824, 256] + - [206, 1611.0] + - - [13600, 3, 2, 256, 13600, 13600, 13600, 256] + - [175, 1587.0] + - - [3456, 12, 2, 256, 3456, 3456, 3456, 256] + - [189, 2893.0] + - - [3800, 3, 2, 256, 3800, 3800, 3800, 256] + - [179, 993.0] + - - [3400, 12, 2, 256, 3400, 3400, 3400, 256] + - [201, 2831.0] + - - [3800, 12, 2, 256, 3800, 3800, 3800, 256] + - [202, 3016.0] + - - [55296, 3, 2, 256, 55296, 55296, 55296, 256] + - [185, 2367.0] + - - [3220, 3, 2, 256, 3220, 3220, 3220, 256] + - [179, 868.0] + - - [3072, 3, 2, 256, 3072, 3072, 3072, 256] + - [176, 849.0] + - - [3220, 12, 2, 256, 3220, 3220, 3220, 256] + - [190, 2918.0] + - - [3072, 12, 2, 256, 3072, 3072, 3072, 256] + - [202, 2809.0] + - - [54400, 3, 2, 256, 54400, 54400, 54400, 256] + - [197, 2378.0] + - - [60800, 12, 2, 256, 60800, 60800, 60800, 256] + - [198, 4808.0] + - - [60800, 3, 2, 256, 60800, 60800, 60800, 256] + - [199, 2363.0] + - - [1909283, 11, 1, 11, 1909283, 1909283, 1909283, 11] + - [186, 2039.0] + - - [3818566, 11, 1, 11, 3818566, 3818566, 3818566, 11] + - [187, 1125.0] + - - [2048, 8, 1, 2, 2048, 2048, 2048, 2] + - [144, 18.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [204, 1446.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [174, 7.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [182, 443.0] + - - [2560, 27, 1, 29000, 2560, 2560, 2560, 29000] + - [192, 3625.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 3328] + - [156, 543.0] + - - [35, 1500, 1, 2560, 35, 35, 35, 2560] + - [133, 2793.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 1280] + - [133, 612.0] + - - [4, 3584, 1, 128, 4, 4, 4, 128] + - [209, 483.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 3328] + - [156, 449.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 3328] + - [133, 938.0] + - - [4, 4288, 1, 128, 4, 4, 4, 128] + - [209, 563.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 1280] + - [133, 914.0] + - - [4, 5056, 1, 256, 4, 4, 4, 256] + - [209, 740.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 3328] + - [156, 722.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 1280] + - [209, 896.0] + - - [35, 1500, 1, 2048, 35, 35, 35, 2048] + - [151, 2725.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 3328] + - [156, 645.0] + - - [4, 1856, 1, 256, 4, 4, 4, 256] + - [133, 385.0] + - - [4, 2944, 1, 256, 4, 4, 4, 256] + - [133, 560.0] + - - [4, 6784, 1, 128, 4, 4, 4, 128] + - [208, 668.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 1280] + - [133, 782.0] + - - [4, 5888, 1, 256, 4, 4, 4, 256] + - [207, 785.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 3328] + - [208, 980.0] + - - [4, 6784, 1, 256, 4, 4, 4, 256] + - [207, 733.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1280] + - [156, 427.0] + - - [4, 3584, 1, 256, 4, 4, 4, 256] + - [133, 622.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 1280] + - [133, 695.0] + - - [4, 1408, 1, 256, 4, 4, 4, 256] + - [133, 300.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 3328] + - [133, 918.0] + - - [4, 2368, 1, 128, 4, 4, 4, 128] + - [133, 352.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 1280] + - [208, 952.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1280] + - [156, 520.0] + - - [4, 1856, 1, 128, 4, 4, 4, 128] + - [133, 281.0] + - - [4, 2944, 1, 128, 4, 4, 4, 128] + - [208, 426.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 1280] + - [133, 882.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 3328] + - [207, 914.0] + - - [4, 5056, 1, 128, 4, 4, 4, 128] + - [208, 602.0] + - - [4, 4288, 1, 256, 4, 4, 4, 256] + - [207, 711.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3328] + - [133, 802.0] + - - [4, 2368, 1, 256, 4, 4, 4, 256] + - [133, 466.0] + - - [4, 5888, 1, 128, 4, 4, 4, 128] + - [208, 655.0] + - - [4, 1408, 1, 128, 4, 4, 4, 128] + - [133, 218.0] + - - [16, 2000, 1, 2048, 16, 16, 16, 2048] + - [171, 2291.0] + - - [2, 2048, 1, 2000, 2, 2, 2, 2000] + - [127, 288.0] + - - [32, 2000, 1, 2048, 32, 32, 32, 2048] + - [135, 3591.0] + - - [10, 2000, 1, 1024, 10, 10, 10, 1024] + - [156, 1358.0] + - - [2, 2000, 1, 100, 2, 2, 2, 100] + - [121, 126.0] + - - [10, 2000, 1, 512, 10, 10, 10, 512] + - [156, 1243.0] + - - [32, 2000, 1, 500, 32, 32, 32, 500] + - [123, 3279.0] + - - [32, 2000, 1, 1024, 32, 32, 32, 1024] + - [153, 3562.0] + - - [4, 2048, 1, 500, 4, 4, 4, 500] + - [127, 480.0] + - - [16, 2000, 1, 500, 16, 16, 16, 500] + - [148, 1874.0] + - - [4, 2048, 1, 100, 4, 4, 4, 100] + - [133, 258.0] + - - [16, 2000, 1, 100, 16, 16, 16, 100] + - [148, 994.0] + - - [4, 2000, 1, 10, 4, 4, 4, 10] + - [125, 48.0] + - - [10, 2000, 1, 10, 10, 10, 10, 10] + - [125, 119.0] + - - [2, 2048, 1, 512, 2, 2, 2, 512] + - [156, 258.0] + - - [10, 2048, 1, 100, 10, 10, 10, 100] + - [125, 621.0] + - - [8, 2048, 1, 100, 8, 8, 8, 100] + - [133, 512.0] + - - [2, 2048, 1, 1024, 2, 2, 2, 1024] + - [156, 279.0] + - - [16, 2000, 1, 1024, 16, 16, 16, 1024] + - [156, 2190.0] + - - [10, 2000, 1, 2000, 10, 10, 10, 2000] + - [127, 1404.0] + - - [8, 2000, 1, 500, 8, 8, 8, 500] + - [148, 937.0] + - - [16, 2000, 1, 2000, 16, 16, 16, 2000] + - [127, 2255.0] + - - [10, 2048, 1, 2048, 10, 10, 10, 2048] + - [171, 1475.0] + - - [8, 2000, 1, 512, 8, 8, 8, 512] + - [156, 999.0] + - - [2, 2000, 1, 2048, 2, 2, 2, 2048] + - [156, 289.0] + - - [16, 2048, 1, 500, 16, 16, 16, 500] + - [127, 1914.0] + - - [8, 2048, 1, 1024, 8, 8, 8, 1024] + - [156, 1110.0] + - - [2, 2000, 1, 500, 2, 2, 2, 500] + - [148, 235.0] + - - [32, 2048, 1, 100, 32, 32, 32, 100] + - [137, 2010.0] + - - [10, 2048, 1, 500, 10, 10, 10, 500] + - [127, 1193.0] + - - [4, 2000, 1, 2048, 4, 4, 4, 2048] + - [171, 575.0] + - - [8, 2000, 1, 1024, 8, 8, 8, 1024] + - [156, 1089.0] + - - [32, 2048, 1, 512, 32, 32, 32, 512] + - [153, 3369.0] + - - [32, 2048, 1, 1024, 32, 32, 32, 1024] + - [135, 3631.0] + - - [32, 2048, 1, 500, 32, 32, 32, 500] + - [123, 3371.0] + - - [10, 2048, 1, 1024, 10, 10, 10, 1024] + - [156, 1394.0] + - - [8, 2048, 1, 2048, 8, 8, 8, 2048] + - [156, 1173.0] + - - [16, 2048, 1, 2048, 16, 16, 16, 2048] + - [171, 2350.0] + - - [8, 2000, 1, 10, 8, 8, 8, 10] + - [125, 99.0] + - - [4, 2000, 1, 2000, 4, 4, 4, 2000] + - [127, 561.0] + - - [8, 2048, 1, 512, 8, 8, 8, 512] + - [156, 1016.0] + - - [8, 2000, 1, 2048, 8, 8, 8, 2048] + - [156, 1147.0] + - - [32, 2048, 1, 2000, 32, 32, 32, 2000] + - [153, 3830.0] + - - [16, 2000, 1, 10, 16, 16, 16, 10] + - [125, 195.0] + - - [8, 2048, 1, 2000, 8, 8, 8, 2000] + - [127, 1149.0] + - - [4, 2048, 1, 2048, 4, 4, 4, 2048] + - [156, 586.0] + - - [10, 2048, 1, 2000, 10, 10, 10, 2000] + - [127, 1438.0] + - - [8, 2000, 1, 100, 8, 8, 8, 100] + - [127, 503.0] + - - [2, 2000, 1, 2000, 2, 2, 2, 2000] + - [127, 281.0] + - - [16, 2048, 1, 1024, 16, 16, 16, 1024] + - [156, 2228.0] + - - [32, 2000, 1, 2000, 32, 32, 32, 2000] + - [135, 3754.0] + - - [32, 2048, 1, 2048, 32, 32, 32, 2048] + - [168, 3653.0] + - - [2, 2048, 1, 10, 2, 2, 2, 10] + - [125, 25.0] + - - [4, 2048, 1, 512, 4, 4, 4, 512] + - [156, 514.0] + - - [4, 2048, 1, 10, 4, 4, 4, 10] + - [125, 50.0] + - - [16, 2048, 1, 100, 16, 16, 16, 100] + - [148, 1011.0] + - - [4, 2000, 1, 500, 4, 4, 4, 500] + - [148, 469.0] + - - [10, 2000, 1, 500, 10, 10, 10, 500] + - [156, 1166.0] + - - [32, 2000, 1, 512, 32, 32, 32, 512] + - [153, 3297.0] + - - [2, 2000, 1, 1024, 2, 2, 2, 1024] + - [156, 273.0] + - - [2, 2000, 1, 512, 2, 2, 2, 512] + - [156, 250.0] + - - [4, 2048, 1, 1024, 4, 4, 4, 1024] + - [156, 560.0] + - - [8, 2048, 1, 500, 8, 8, 8, 500] + - [148, 957.0] + - - [4, 2048, 1, 2000, 4, 4, 4, 2000] + - [127, 575.0] + - - [8, 2000, 1, 2000, 8, 8, 8, 2000] + - [127, 1125.0] + - - [4, 2000, 1, 1024, 4, 4, 4, 1024] + - [156, 545.0] + - - [32, 2000, 1, 100, 32, 32, 32, 100] + - [125, 1939.0] + - - [2, 2048, 1, 100, 2, 2, 2, 100] + - [133, 129.0] + - - [8, 2048, 1, 10, 8, 8, 8, 10] + - [125, 100.0] + - - [2, 2048, 1, 2048, 2, 2, 2, 2048] + - [171, 293.0] + - - [10, 2000, 1, 2048, 10, 10, 10, 2048] + - [156, 1440.0] + - - [16, 2048, 1, 2000, 16, 16, 16, 2000] + - [127, 2306.0] + - - [10, 2048, 1, 512, 10, 10, 10, 512] + - [156, 1282.0] + - - [16, 2048, 1, 512, 16, 16, 16, 512] + - [156, 2056.0] + - - [2, 2000, 1, 10, 2, 2, 2, 10] + - [125, 24.0] + - - [4, 2000, 1, 100, 4, 4, 4, 100] + - [133, 250.0] + - - [16, 2000, 1, 512, 16, 16, 16, 512] + - [156, 2008.0] + - - [32, 2048, 1, 10, 32, 32, 32, 10] + - [125, 395.0] + - - [10, 2048, 1, 10, 10, 10, 10, 10] + - [125, 123.0] + - - [4, 2000, 1, 512, 4, 4, 4, 512] + - [156, 501.0] + - - [16, 2048, 1, 10, 16, 16, 16, 10] + - [123, 197.0] + - - [32, 2000, 1, 10, 32, 32, 32, 10] + - [130, 386.0] + - - [10, 2000, 1, 100, 10, 10, 10, 100] + - [133, 610.0] + - - [2, 2048, 1, 500, 2, 2, 2, 500] + - [127, 239.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [212, 188.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [213, 2893.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [214, 375.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [215, 167.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [214, 1470.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [217, 745.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [215, 2632.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [216, 336.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [216, 1331.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [214, 670.0] + - - [1024, 20, 1, 30522, 1024, 1024, 1024, 30522] + - [211, 3423.0] + - - [49, 512, 1, 4608, 49, 49, 49, 4608] + - [210, 3264.0] + - - [64, 512, 1, 1, 64, 64, 64, 1] + - [126, 19.0] + - - [1024, 32, 1, 2, 1024, 1024, 1024, 2] + - [130, 44.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 2467.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [128, 1910.0] + - - [768, 32, 1, 2, 768, 768, 768, 2] + - [162, 34.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [149, 3183.0] + - - [768, 64, 1, 2, 768, 768, 768, 2] + - [162, 66.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 1589.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [123, 3775.0] + - - [32, 200, 1, 1, 32, 32, 32, 1] + - [121, 4.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 332.0] + - - [1024, 4, 1, 2, 1024, 1024, 1024, 2] + - [121, 5.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [165, 987.0] + - - [768, 16, 1, 2, 768, 768, 768, 2] + - [144, 17.0] + - - [768, 8, 1, 768, 768, 768, 768, 768] + - [165, 486.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 499.0] + - - [1024, 6, 1, 2, 1024, 1024, 1024, 2] + - [121, 8.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 666.0] + - - [4, 704, 1, 1280, 4, 4, 4, 1280] + - [158, 230.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [149, 147.0] + - - [64, 4, 1, 256, 64, 64, 64, 256] + - [127, 14.0] + - - [64, 704, 1, 128, 64, 64, 64, 128] + - [135, 1769.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [128, 2392.0] + - - [128, 4, 1, 1280, 128, 128, 128, 1280] + - [141, 44.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [149, 1872.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [135, 3732.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [153, 3097.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [128, 353.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 128] + - [125, 2317.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [135, 3888.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [127, 11.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [162, 3660.0] + - - [704, 4, 1, 1280, 704, 704, 704, 1280] + - [149, 241.0] + - - [64, 256, 1, 128, 64, 64, 64, 128] + - [135, 677.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [135, 3833.0] + - - [64, 1024, 1, 128, 64, 64, 64, 128] + - [135, 2356.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [165, 2893.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [149, 2341.0] + - - [448, 4, 1, 256, 448, 448, 448, 256] + - [149, 99.0] + - - [256, 4, 1, 1280, 256, 256, 256, 1280] + - [128, 87.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [128, 1195.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [165, 381.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [149, 37.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [149, 3268.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [149, 57.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [141, 1423.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 256] + - [149, 232.0] + - - [4, 704, 1, 256, 4, 4, 4, 256] + - [133, 153.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [149, 3101.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [123, 2640.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [165, 2692.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [165, 2550.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [149, 1336.0] + - - [4, 448, 1, 128, 4, 4, 4, 128] + - [135, 71.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [172, 768.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [128, 1545.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [158, 1864.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [127, 5.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 3328] + - [132, 375.0] + - - [4, 4, 1, 256, 4, 4, 4, 256] + - [121, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [128, 999.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [149, 2706.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [158, 714.0] + - - [4, 448, 1, 3328, 4, 4, 4, 3328] + - [172, 167.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [168, 2975.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [135, 2307.0] + - - [704, 64, 1, 128, 704, 704, 704, 128] + - [135, 1769.0] + - - [448, 4, 1, 1280, 448, 448, 448, 1280] + - [149, 152.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [149, 146.0] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [149, 1540.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [123, 2640.0] + - - [448, 64, 1, 128, 448, 448, 448, 128] + - [149, 1176.0] + - - [4, 448, 1, 256, 4, 4, 4, 256] + - [135, 98.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [149, 3268.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [168, 2943.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 3328] + - [141, 354.0] + - - [4, 704, 1, 128, 4, 4, 4, 128] + - [133, 111.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [128, 340.0] + - - [704, 4, 1, 128, 704, 704, 704, 128] + - [149, 109.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [149, 2508.0] + - - [448, 4, 1, 3328, 448, 448, 448, 3328] + - [141, 165.0] + - - [256, 4, 1, 3328, 256, 256, 256, 3328] + - [132, 94.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [133, 55.0] + - - [4, 64, 1, 1280, 4, 4, 4, 1280] + - [128, 22.0] + - - [4, 4, 1, 128, 4, 4, 4, 128] + - [121, 1.0] + - - [4, 128, 1, 256, 4, 4, 4, 256] + - [133, 28.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [144, 3472.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [149, 1638.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [158, 1421.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [128, 2267.0] + - - [64, 4, 1, 128, 64, 64, 64, 128] + - [128, 10.0] + - - [256, 64, 1, 128, 256, 256, 256, 128] + - [127, 681.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [158, 240.0] + - - [4, 704, 1, 3328, 4, 4, 4, 3328] + - [141, 246.0] + - - [4, 4, 1, 1280, 4, 4, 4, 1280] + - [121, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [128, 728.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 128] + - [149, 161.0] + - - [4, 64, 1, 128, 4, 4, 4, 128] + - [133, 10.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [128, 708.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [141, 1425.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [141, 73.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [158, 483.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 1280] + - [149, 342.0] + - - [35, 700, 1, 2048, 35, 35, 35, 2048] + - [171, 1813.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [153, 2344.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [162, 3327.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [128, 774.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [149, 1653.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [149, 1195.0] + - - [4, 256, 1, 128, 4, 4, 4, 128] + - [133, 40.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [141, 596.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [162, 2943.0] + - - [4, 4, 1, 3328, 4, 4, 4, 3328] + - [121, 1.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1280] + - [158, 331.0] + - - [704, 4, 1, 256, 704, 704, 704, 256] + - [149, 157.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [158, 481.0] + - - [128, 4, 1, 3328, 128, 128, 128, 3328] + - [165, 48.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [133, 20.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [128, 11.0] + - - [4, 128, 1, 3328, 4, 4, 4, 3328] + - [128, 48.0] + - - [256, 256, 1, 128, 256, 256, 256, 128] + - [125, 2370.0] + - - [704, 4, 1, 3328, 704, 704, 704, 3328] + - [128, 260.0] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [144, 3327.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [162, 3812.0] + - - [256, 4, 1, 128, 256, 256, 256, 128] + - [165, 40.0] + - - [4, 1024, 1, 128, 4, 4, 4, 128] + - [135, 161.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [149, 1517.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [123, 2121.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [149, 1344.0] + - - [128, 4, 1, 256, 128, 128, 128, 256] + - [128, 28.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [144, 3762.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [165, 2919.0] + - - [4, 448, 1, 1280, 4, 4, 4, 1280] + - [141, 153.0] + - - [448, 4, 1, 128, 448, 448, 448, 128] + - [139, 68.0] + - - [4, 256, 1, 3328, 4, 4, 4, 3328] + - [128, 95.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [133, 20.0] + - - [4, 256, 1, 1280, 4, 4, 4, 1280] + - [141, 87.0] + - - [64, 4, 1, 3328, 64, 64, 64, 3328] + - [149, 24.0] + - - [4, 64, 1, 3328, 4, 4, 4, 3328] + - [128, 24.0] + - - [35, 700, 1, 2560, 35, 35, 35, 2560] + - [127, 1848.0] + - - [4, 1024, 1, 256, 4, 4, 4, 256] + - [141, 222.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [158, 958.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [149, 295.0] + - - [4, 64, 1, 256, 4, 4, 4, 256] + - [141, 14.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [123, 2159.0] + - - [64, 448, 1, 128, 64, 64, 64, 128] + - [135, 1176.0] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [123, 3476.0] + - - [4, 128, 1, 1280, 4, 4, 4, 1280] + - [141, 44.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [149, 340.0] + - - [64, 64, 1, 128, 64, 64, 64, 128] + - [149, 170.0] + - - [64, 4, 1, 1280, 64, 64, 64, 1280] + - [149, 22.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [149, 74.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [158, 962.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [162, 2874.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [163, 3740.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [163, 4261.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [123, 1950.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [136, 1371.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [162, 4183.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [169, 1756.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [163, 5841.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [123, 2586.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [169, 4778.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [123, 2278.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [162, 3459.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [169, 5467.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [162, 3166.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [162, 3738.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [163, 6145.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [145, 4123.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [169, 5122.0] + - - [8, 500, 1, 512, 8, 8, 8, 512] + - [141, 281.0] + - - [32, 512, 1, 512, 32, 32, 32, 512] + - [158, 1202.0] + - - [8, 512, 1, 500, 8, 8, 8, 500] + - [158, 285.0] + - - [8, 500, 1, 1024, 8, 8, 8, 1024] + - [141, 331.0] + - - [64, 1024, 1, 100, 64, 64, 64, 100] + - [125, 2035.0] + - - [64, 1024, 1, 500, 64, 64, 64, 500] + - [144, 3392.0] + - - [64, 1024, 1, 1024, 64, 64, 64, 1024] + - [168, 3663.0] + - - [2, 500, 1, 2048, 2, 2, 2, 2048] + - [141, 90.0] + - - [16, 512, 1, 10, 16, 16, 16, 10] + - [127, 53.0] + - - [8, 512, 1, 10, 8, 8, 8, 10] + - [127, 25.0] + - - [16, 500, 1, 2048, 16, 16, 16, 2048] + - [158, 720.0] + - - [10, 100, 1, 500, 10, 10, 10, 500] + - [158, 70.0] + - - [16, 100, 1, 10, 16, 16, 16, 10] + - [130, 10.0] + - - [2, 100, 1, 2000, 2, 2, 2, 2000] + - [128, 18.0] + - - [256, 100, 1, 2048, 256, 256, 256, 2048] + - [149, 2214.0] + - - [2, 512, 1, 512, 2, 2, 2, 512] + - [141, 72.0] + - - [2, 100, 1, 10, 2, 2, 2, 10] + - [121, 1.0] + - - [200, 100, 1, 100, 200, 200, 200, 100] + - [168, 671.0] + - - [500, 100, 1, 100, 500, 500, 500, 100] + - [133, 1553.0] + - - [4, 100, 1, 10, 4, 4, 4, 10] + - [121, 2.0] + - - [32, 100, 1, 512, 32, 32, 32, 512] + - [149, 226.0] + - - [16, 1024, 1, 512, 16, 16, 16, 512] + - [158, 1131.0] + - - [4, 1024, 1, 1024, 4, 4, 4, 1024] + - [158, 323.0] + - - [4, 512, 1, 10, 4, 4, 4, 10] + - [127, 13.0] + - - [128, 100, 1, 10, 128, 128, 128, 10] + - [130, 78.0] + - - [4, 512, 1, 2048, 4, 4, 4, 2048] + - [141, 184.0] + - - [10, 1024, 1, 2000, 10, 10, 10, 2000] + - [149, 848.0] + - - [256, 100, 1, 100, 256, 256, 256, 100] + - [167, 831.0] + - - [64, 1024, 1, 2048, 64, 64, 64, 2048] + - [168, 3783.0] + - - [16, 1024, 1, 100, 16, 16, 16, 100] + - [149, 529.0] + - - [32, 1024, 1, 1024, 32, 32, 32, 1024] + - [158, 2601.0] + - - [8, 100, 1, 500, 8, 8, 8, 500] + - [128, 56.0] + - - [10, 512, 1, 512, 10, 10, 10, 512] + - [141, 355.0] + - - [8, 500, 1, 10, 8, 8, 8, 10] + - [123, 24.0] + - - [16, 1024, 1, 10, 16, 16, 16, 10] + - [127, 101.0] + - - [16, 512, 1, 2048, 16, 16, 16, 2048] + - [158, 737.0] + - - [128, 512, 1, 2048, 128, 128, 128, 2048] + - [135, 3813.0] + - - [128, 512, 1, 100, 128, 128, 128, 100] + - [125, 1998.0] + - - [64, 500, 1, 2048, 64, 64, 64, 2048] + - [158, 2708.0] + - - [500, 100, 1, 10, 500, 500, 500, 10] + - [130, 287.0] + - - [64, 100, 1, 2048, 64, 64, 64, 2048] + - [158, 574.0] + - - [64, 100, 1, 10, 64, 64, 64, 10] + - [130, 39.0] + - - [16, 512, 1, 500, 16, 16, 16, 500] + - [128, 566.0] + - - [200, 100, 1, 2000, 200, 200, 200, 2000] + - [149, 1733.0] + - - [2, 100, 1, 512, 2, 2, 2, 512] + - [128, 14.0] + - - [32, 512, 1, 100, 32, 32, 32, 100] + - [133, 546.0] + - - [16, 512, 1, 1024, 16, 16, 16, 1024] + - [141, 672.0] + - - [4, 1024, 1, 512, 4, 4, 4, 512] + - [158, 283.0] + - - [2, 500, 1, 500, 2, 2, 2, 500] + - [128, 70.0] + - - [32, 100, 1, 100, 32, 32, 32, 100] + - [133, 107.0] + - - [100, 500, 1, 2000, 100, 100, 100, 2000] + - [123, 2924.0] + - - [10, 512, 1, 10, 10, 10, 10, 10] + - [127, 31.0] + - - [100, 500, 1, 2048, 100, 100, 100, 2048] + - [153, 2919.0] + - - [2, 100, 1, 1024, 2, 2, 2, 1024] + - [128, 16.0] + - - [32, 512, 1, 1024, 32, 32, 32, 1024] + - [172, 1380.0] + - - [256, 100, 1, 1024, 256, 256, 256, 1024] + - [165, 2054.0] + - - [128, 100, 1, 100, 128, 128, 128, 100] + - [133, 435.0] + - - [32, 512, 1, 10, 32, 32, 32, 10] + - [130, 104.0] + - - [128, 100, 1, 1024, 128, 128, 128, 1024] + - [149, 1067.0] + - - [16, 500, 1, 2000, 16, 16, 16, 2000] + - [149, 702.0] + - - [64, 500, 1, 500, 64, 64, 64, 500] + - [149, 2162.0] + - - [128, 512, 1, 1024, 128, 128, 128, 1024] + - [135, 3679.0] + - - [128, 512, 1, 2000, 128, 128, 128, 2000] + - [123, 3889.0] + - - [2, 512, 1, 10, 2, 2, 2, 10] + - [121, 6.0] + - - [10, 512, 1, 500, 10, 10, 10, 500] + - [165, 355.0] + - - [4, 1024, 1, 2000, 4, 4, 4, 2000] + - [149, 339.0] + - - [256, 100, 1, 2000, 256, 256, 256, 2000] + - [149, 2218.0] + - - [100, 100, 1, 10, 100, 100, 100, 10] + - [130, 62.0] + - - [128, 512, 1, 10, 128, 128, 128, 10] + - [130, 381.0] + - - [256, 100, 1, 500, 256, 256, 256, 500] + - [149, 1783.0] + - - [64, 100, 1, 512, 64, 64, 64, 512] + - [141, 453.0] + - - [64, 512, 1, 500, 64, 64, 64, 500] + - [149, 2226.0] + - - [8, 100, 1, 512, 8, 8, 8, 512] + - [141, 56.0] + - - [32, 100, 1, 500, 32, 32, 32, 500] + - [141, 223.0] + - - [32, 500, 1, 2048, 32, 32, 32, 2048] + - [172, 1441.0] + - - [128, 500, 1, 2000, 128, 128, 128, 2000] + - [144, 3794.0] + - - [8, 1024, 1, 10, 8, 8, 8, 10] + - [130, 49.0] + - - [2, 500, 1, 100, 2, 2, 2, 100] + - [135, 33.0] + - - [10, 500, 1, 512, 10, 10, 10, 512] + - [172, 354.0] + - - [32, 500, 1, 500, 32, 32, 32, 500] + - [149, 1127.0] + - - [100, 500, 1, 100, 100, 100, 100, 100] + - [133, 1553.0] + - - [10, 1024, 1, 512, 10, 10, 10, 512] + - [141, 707.0] + - - [512, 100, 1, 512, 512, 512, 512, 512] + - [144, 2675.0] + - - [4, 500, 1, 500, 4, 4, 4, 500] + - [149, 141.0] + - - [64, 100, 1, 1024, 64, 64, 64, 1024] + - [172, 528.0] + - - [2, 500, 1, 2000, 2, 2, 2, 2000] + - [128, 88.0] + - - [32, 512, 1, 2048, 32, 32, 32, 2048] + - [172, 1487.0] + - - [10, 100, 1, 2000, 10, 10, 10, 2000] + - [128, 89.0] + - - [4, 100, 1, 512, 4, 4, 4, 512] + - [141, 28.0] + - - [2, 512, 1, 2048, 2, 2, 2, 2048] + - [141, 92.0] + - - [100, 100, 1, 2000, 100, 100, 100, 2000] + - [128, 890.0] + - - [10, 500, 1, 500, 10, 10, 10, 500] + - [128, 350.0] + - - [2, 100, 1, 2048, 2, 2, 2, 2048] + - [128, 18.0] + - - [32, 100, 1, 2048, 32, 32, 32, 2048] + - [172, 289.0] + - - [16, 100, 1, 1024, 16, 16, 16, 1024] + - [141, 132.0] + - - [2, 500, 1, 10, 2, 2, 2, 10] + - [121, 6.0] + - - [500, 100, 1, 2048, 500, 500, 500, 2048] + - [144, 2901.0] + - - [16, 1024, 1, 2000, 16, 16, 16, 2000] + - [149, 1354.0] + - - [10, 1024, 1, 1024, 10, 10, 10, 1024] + - [158, 805.0] + - - [500, 100, 1, 512, 500, 500, 500, 512] + - [153, 2586.0] + - - [32, 512, 1, 500, 32, 32, 32, 500] + - [165, 1144.0] + - - [100, 500, 1, 512, 100, 100, 100, 512] + - [153, 2596.0] + - - [8, 500, 1, 2000, 8, 8, 8, 2000] + - [149, 351.0] + - - [4, 100, 1, 1024, 4, 4, 4, 1024] + - [128, 33.0] + - - [2, 500, 1, 1024, 2, 2, 2, 1024] + - [141, 83.0] + - - [100, 500, 1, 1024, 100, 100, 100, 1024] + - [153, 2807.0] + - - [32, 100, 1, 1024, 32, 32, 32, 1024] + - [128, 265.0] + - - [64, 100, 1, 2000, 64, 64, 64, 2000] + - [172, 570.0] + - - [64, 500, 1, 10, 64, 64, 64, 10] + - [130, 198.0] + - - [64, 500, 1, 512, 64, 64, 64, 512] + - [158, 2145.0] + - - [10, 100, 1, 1024, 10, 10, 10, 1024] + - [141, 83.0] + - - [16, 512, 1, 100, 16, 16, 16, 100] + - [165, 268.0] + - - [4, 100, 1, 2000, 4, 4, 4, 2000] + - [128, 35.0] + - - [2, 512, 1, 1024, 2, 2, 2, 1024] + - [141, 84.0] + - - [64, 512, 1, 1024, 64, 64, 64, 1024] + - [158, 2597.0] + - - [512, 100, 1, 2048, 512, 512, 512, 2048] + - [144, 3041.0] + - - [32, 100, 1, 2000, 32, 32, 32, 2000] + - [141, 283.0] + - - [4, 512, 1, 500, 4, 4, 4, 500] + - [128, 142.0] + - - [4, 500, 1, 1024, 4, 4, 4, 1024] + - [141, 165.0] + - - [32, 100, 1, 10, 32, 32, 32, 10] + - [130, 20.0] + - - [10, 1024, 1, 2048, 10, 10, 10, 2048] + - [172, 861.0] + - - [8, 500, 1, 100, 8, 8, 8, 100] + - [127, 130.0] + - - [200, 100, 1, 1024, 200, 200, 200, 1024] + - [165, 1603.0] + - - [16, 100, 1, 100, 16, 16, 16, 100] + - [128, 51.0] + - - [8, 1024, 1, 2000, 8, 8, 8, 2000] + - [149, 677.0] + - - [4, 512, 1, 100, 4, 4, 4, 100] + - [128, 67.0] + - - [16, 500, 1, 100, 16, 16, 16, 100] + - [165, 261.0] + - - [8, 1024, 1, 2048, 8, 8, 8, 2048] + - [158, 688.0] + - - [16, 1024, 1, 2048, 16, 16, 16, 2048] + - [158, 1370.0] + - - [64, 512, 1, 100, 64, 64, 64, 100] + - [133, 1092.0] + - - [2, 100, 1, 500, 2, 2, 2, 500] + - [128, 14.0] + - - [2, 500, 1, 512, 2, 2, 2, 512] + - [158, 71.0] + - - [128, 500, 1, 1024, 128, 128, 128, 1024] + - [168, 3605.0] + - - [10, 100, 1, 10, 10, 10, 10, 10] + - [121, 6.0] + - - [64, 1024, 1, 10, 64, 64, 64, 10] + - [121, 400.0] + - - [500, 100, 1, 500, 500, 500, 500, 500] + - [153, 2593.0] + - - [2, 512, 1, 100, 2, 2, 2, 100] + - [121, 33.0] + - - [16, 100, 1, 500, 16, 16, 16, 500] + - [141, 111.0] + - - [128, 100, 1, 500, 128, 128, 128, 500] + - [158, 909.0] + - - [512, 100, 1, 1024, 512, 512, 512, 1024] + - [162, 2887.0] + - - [16, 100, 1, 2000, 16, 16, 16, 2000] + - [128, 141.0] + - - [10, 512, 1, 100, 10, 10, 10, 100] + - [149, 166.0] + - - [8, 512, 1, 100, 8, 8, 8, 100] + - [128, 134.0] + - - [128, 100, 1, 2000, 128, 128, 128, 2000] + - [128, 1148.0] + - - [2, 1024, 1, 2000, 2, 2, 2, 2000] + - [149, 170.0] + - - [100, 512, 1, 512, 100, 100, 100, 512] + - [153, 2653.0] + - - [32, 1024, 1, 2000, 32, 32, 32, 2000] + - [149, 2717.0] + - - [128, 500, 1, 100, 128, 128, 128, 100] + - [125, 1963.0] + - - [100, 100, 1, 100, 100, 100, 100, 100] + - [133, 331.0] + - - [8, 512, 1, 1024, 8, 8, 8, 1024] + - [158, 337.0] + - - [200, 100, 1, 500, 200, 200, 200, 500] + - [149, 1389.0] + - - [2, 1024, 1, 2048, 2, 2, 2, 2048] + - [141, 171.0] + - - [512, 100, 1, 2000, 512, 512, 512, 2000] + - [162, 3022.0] + - - [16, 512, 1, 2000, 16, 16, 16, 2000] + - [128, 719.0] + - - [64, 500, 1, 1024, 64, 64, 64, 1024] + - [158, 2490.0] + - - [10, 512, 1, 1024, 10, 10, 10, 1024] + - [141, 423.0] + - - [512, 100, 1, 100, 512, 512, 512, 100] + - [162, 1580.0] + - - [8, 100, 1, 1024, 8, 8, 8, 1024] + - [141, 66.0] + - - [10, 100, 1, 100, 10, 10, 10, 100] + - [128, 32.0] + - - [10, 500, 1, 2000, 10, 10, 10, 2000] + - [172, 439.0] + - - [500, 100, 1, 2000, 500, 500, 500, 2000] + - [162, 2971.0] + - - [100, 512, 1, 2000, 100, 100, 100, 2000] + - [168, 2982.0] + - - [64, 1024, 1, 512, 64, 64, 64, 512] + - [135, 3403.0] + - - [32, 500, 1, 100, 32, 32, 32, 100] + - [133, 537.0] + - - [10, 100, 1, 2048, 10, 10, 10, 2048] + - [141, 90.0] + - - [64, 100, 1, 100, 64, 64, 64, 100] + - [133, 215.0] + - - [2, 1024, 1, 100, 2, 2, 2, 100] + - [121, 66.0] + - - [64, 500, 1, 2000, 64, 64, 64, 2000] + - [149, 2669.0] + - - [8, 512, 1, 512, 8, 8, 8, 512] + - [141, 286.0] + - - [8, 512, 1, 2048, 8, 8, 8, 2048] + - [158, 369.0] + - - [100, 100, 1, 1024, 100, 100, 100, 1024] + - [172, 834.0] + - - [8, 100, 1, 2000, 8, 8, 8, 2000] + - [141, 71.0] + - - [2, 1024, 1, 1024, 2, 2, 2, 1024] + - [172, 161.0] + - - [16, 512, 1, 512, 16, 16, 16, 512] + - [149, 575.0] + - - [32, 500, 1, 512, 32, 32, 32, 512] + - [141, 1113.0] + - - [32, 500, 1, 1024, 32, 32, 32, 1024] + - [172, 1321.0] + - - [32, 500, 1, 10, 32, 32, 32, 10] + - [130, 99.0] + - - [4, 1024, 1, 500, 4, 4, 4, 500] + - [165, 275.0] + - - [256, 100, 1, 512, 256, 256, 256, 512] + - [149, 1800.0] + - - [8, 1024, 1, 500, 8, 8, 8, 500] + - [149, 549.0] + - - [4, 1024, 1, 100, 4, 4, 4, 100] + - [128, 131.0] + - - [100, 500, 1, 500, 100, 100, 100, 500] + - [135, 2541.0] + - - [2, 1024, 1, 500, 2, 2, 2, 500] + - [128, 137.0] + - - [64, 100, 1, 500, 64, 64, 64, 500] + - [141, 451.0] + - - [2, 512, 1, 500, 2, 2, 2, 500] + - [128, 71.0] + - - [10, 1024, 1, 500, 10, 10, 10, 500] + - [128, 688.0] + - - [128, 500, 1, 512, 128, 128, 128, 512] + - [168, 3303.0] + - - [10, 500, 1, 2048, 10, 10, 10, 2048] + - [158, 451.0] + - - [128, 512, 1, 512, 128, 128, 128, 512] + - [153, 3396.0] + - - [64, 512, 1, 10, 64, 64, 64, 10] + - [130, 210.0] + - - [32, 500, 1, 2000, 32, 32, 32, 2000] + - [172, 1410.0] + - - [100, 100, 1, 2048, 100, 100, 100, 2048] + - [172, 906.0] + - - [200, 100, 1, 512, 200, 200, 200, 512] + - [149, 1410.0] + - - [200, 100, 1, 2048, 200, 200, 200, 2048] + - [149, 1728.0] + - - [8, 100, 1, 10, 8, 8, 8, 10] + - [121, 5.0] + - - [100, 100, 1, 500, 100, 100, 100, 500] + - [158, 708.0] + - - [100, 500, 1, 10, 100, 100, 100, 10] + - [130, 291.0] + - - [10, 500, 1, 1024, 10, 10, 10, 1024] + - [141, 414.0] + - - [256, 100, 1, 10, 256, 256, 256, 10] + - [130, 158.0] + - - [10, 512, 1, 2048, 10, 10, 10, 2048] + - [158, 461.0] + - - [2, 1024, 1, 512, 2, 2, 2, 512] + - [158, 143.0] + - - [4, 500, 1, 2048, 4, 4, 4, 2048] + - [172, 181.0] + - - [100, 512, 1, 100, 100, 100, 100, 100] + - [133, 1590.0] + - - [16, 500, 1, 512, 16, 16, 16, 512] + - [141, 564.0] + - - [10, 1024, 1, 100, 10, 10, 10, 100] + - [149, 328.0] + - - [8, 1024, 1, 100, 8, 8, 8, 100] + - [128, 264.0] + - - [64, 1024, 1, 2000, 64, 64, 64, 2000] + - [123, 3887.0] + - - [10, 100, 1, 512, 10, 10, 10, 512] + - [141, 70.0] + - - [4, 500, 1, 2000, 4, 4, 4, 2000] + - [149, 176.0] + - - [4, 100, 1, 100, 4, 4, 4, 100] + - [121, 13.0] + - - [32, 1024, 1, 512, 32, 32, 32, 512] + - [141, 2292.0] + - - [8, 512, 1, 2000, 8, 8, 8, 2000] + - [128, 359.0] + - - [100, 100, 1, 512, 100, 100, 100, 512] + - [158, 709.0] + - - [2, 512, 1, 2000, 2, 2, 2, 2000] + - [128, 90.0] + - - [16, 500, 1, 10, 16, 16, 16, 10] + - [127, 49.0] + - - [10, 500, 1, 100, 10, 10, 10, 100] + - [149, 161.0] + - - [4, 100, 1, 500, 4, 4, 4, 500] + - [128, 28.0] + - - [64, 500, 1, 100, 64, 64, 64, 100] + - [133, 1053.0] + - - [2, 100, 1, 100, 2, 2, 2, 100] + - [121, 7.0] + - - [10, 512, 1, 2000, 10, 10, 10, 2000] + - [149, 450.0] + - - [8, 500, 1, 500, 8, 8, 8, 500] + - [128, 279.0] + - - [4, 500, 1, 512, 4, 4, 4, 512] + - [158, 141.0] + - - [10, 500, 1, 10, 10, 10, 10, 10] + - [127, 30.0] + - - [64, 512, 1, 2000, 64, 64, 64, 2000] + - [149, 2772.0] + - - [32, 512, 1, 2000, 32, 32, 32, 2000] + - [128, 1444.0] + - - [128, 500, 1, 2048, 128, 128, 128, 2048] + - [135, 3703.0] + - - [4, 512, 1, 512, 4, 4, 4, 512] + - [141, 143.0] + - - [16, 500, 1, 1024, 16, 16, 16, 1024] + - [141, 661.0] + - - [10, 1024, 1, 10, 10, 10, 10, 10] + - [125, 62.0] + - - [16, 500, 1, 500, 16, 16, 16, 500] + - [149, 554.0] + - - [500, 100, 1, 1024, 500, 500, 500, 1024] + - [135, 2786.0] + - - [16, 100, 1, 512, 16, 16, 16, 512] + - [141, 113.0] + - - [64, 512, 1, 2048, 64, 64, 64, 2048] + - [158, 2764.0] + - - [32, 1024, 1, 10, 32, 32, 32, 10] + - [130, 205.0] + - - [8, 1024, 1, 512, 8, 8, 8, 512] + - [158, 565.0] + - - [4, 1024, 1, 2048, 4, 4, 4, 2048] + - [172, 345.0] + - - [128, 500, 1, 500, 128, 128, 128, 500] + - [123, 3320.0] + - - [100, 512, 1, 1024, 100, 100, 100, 1024] + - [168, 2871.0] + - - [16, 1024, 1, 500, 16, 16, 16, 500] + - [149, 1098.0] + - - [128, 100, 1, 2048, 128, 128, 128, 2048] + - [128, 1169.0] + - - [100, 512, 1, 500, 100, 100, 100, 500] + - [153, 2602.0] + - - [8, 1024, 1, 1024, 8, 8, 8, 1024] + - [158, 643.0] + - - [4, 500, 1, 10, 4, 4, 4, 10] + - [123, 12.0] + - - [128, 500, 1, 10, 128, 128, 128, 10] + - [130, 376.0] + - - [32, 1024, 1, 100, 32, 32, 32, 100] + - [133, 1085.0] + - - [8, 500, 1, 2048, 8, 8, 8, 2048] + - [172, 361.0] + - - [16, 1024, 1, 1024, 16, 16, 16, 1024] + - [172, 1285.0] + - - [200, 100, 1, 10, 200, 200, 200, 10] + - [130, 123.0] + - - [512, 100, 1, 500, 512, 512, 512, 500] + - [144, 2672.0] + - - [4, 500, 1, 100, 4, 4, 4, 100] + - [127, 65.0] + - - [8, 100, 1, 2048, 8, 8, 8, 2048] + - [141, 72.0] + - - [512, 100, 1, 10, 512, 512, 512, 10] + - [130, 305.0] + - - [4, 512, 1, 1024, 4, 4, 4, 1024] + - [158, 168.0] + - - [32, 1024, 1, 2048, 32, 32, 32, 2048] + - [172, 2755.0] + - - [128, 100, 1, 512, 128, 128, 128, 512] + - [128, 910.0] + - - [32, 1024, 1, 500, 32, 32, 32, 500] + - [149, 2202.0] + - - [4, 1024, 1, 10, 4, 4, 4, 10] + - [130, 26.0] + - - [100, 512, 1, 10, 100, 100, 100, 10] + - [125, 298.0] + - - [8, 100, 1, 100, 8, 8, 8, 100] + - [128, 26.0] + - - [128, 512, 1, 500, 128, 128, 128, 500] + - [123, 3420.0] + - - [16, 100, 1, 2048, 16, 16, 16, 2048] + - [141, 145.0] + - - [2, 1024, 1, 10, 2, 2, 2, 10] + - [121, 12.0] + - - [4, 100, 1, 2048, 4, 4, 4, 2048] + - [128, 36.0] + - - [4, 512, 1, 2000, 4, 4, 4, 2000] + - [158, 180.0] + - - [1024, 29, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 2297.0] + - - [1024, 1, 1, 21, 1024, 1024, 1024, 21] + - [121, 11.0] + - - [1024, 49, 1, 1024, 1024, 1024, 1024, 1024] + - [162, 2751.0] + - - [1024, 35, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 2414.0] + - - [1024, 24, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 1907.0] + - - [1024, 21, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 1668.0] + - - [1024, 1, 1, 14, 1024, 1024, 1024, 14] + - [130, 9.0] + - - [1024, 91, 1, 1024, 1024, 1024, 1024, 1024] + - [124, 4071.0] + - - [1024, 14, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 1158.0] + - - [1024, 25, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 1983.0] + - - [1024, 27, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 2142.0] + - - [1024, 50, 1, 1024, 1024, 1024, 1024, 1024] + - [162, 2801.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [123, 3577.0] + - - [1024, 13, 1, 1024, 1024, 1024, 1024, 1024] + - [165, 1077.0] + - - [1024, 63, 1, 1024, 1024, 1024, 1024, 1024] + - [162, 3518.0] + - - [1024, 86, 1, 1024, 1024, 1024, 1024, 1024] + - [124, 3854.0] + - - [1024, 1, 1, 13, 1024, 1024, 1024, 13] + - [127, 8.0] + - - [289, 192, 1, 1344, 289, 289, 289, 1344] + - [162, 3206.0] + - - [196, 128, 1, 800, 196, 196, 196, 800] + - [149, 1952.0] + - - [64, 512, 1, 1344, 64, 64, 64, 1344] + - [128, 2692.0] + - - [289, 224, 1, 1568, 289, 289, 289, 1568] + - [123, 3672.0] + - - [64, 256, 1, 1536, 64, 64, 64, 1536] + - [158, 1451.0] + - - [289, 160, 1, 1120, 289, 289, 289, 1120] + - [149, 3116.0] + - - [64, 256, 1, 1152, 64, 64, 64, 1152] + - [172, 1406.0] + - - [289, 224, 1, 1344, 289, 289, 289, 1344] + - [162, 3631.0] + - - [289, 192, 1, 896, 289, 289, 289, 896] + - [162, 3103.0] + - - [784, 16, 32, 192, 784, 784, 784, 192] + - [123, 3956.0] + - - [49, 128, 1, 1200, 49, 49, 49, 1200] + - [149, 523.0] + - - [289, 128, 1, 896, 289, 289, 289, 896] + - [149, 2441.0] + - - [1001, 32, 1, 1024, 1001, 1001, 1001, 1024] + - [128, 2555.0] + - - [64, 448, 1, 1152, 64, 64, 64, 1152] + - [149, 2333.0] + - - [1001, 32, 1, 2048, 1001, 1001, 1001, 2048] + - [128, 2747.0] + - - [289, 192, 1, 1120, 289, 289, 289, 1120] + - [162, 3164.0] + - - [64, 320, 1, 1728, 64, 64, 64, 1728] + - [128, 1762.0] + - - [289, 96, 1, 864, 289, 289, 289, 864] + - [149, 2156.0] + - - [196, 64, 1, 800, 196, 196, 196, 800] + - [141, 992.0] + - - [784, 32, 1, 400, 784, 784, 784, 400] + - [128, 1629.0] + - - [64, 320, 1, 2880, 64, 64, 64, 2880] + - [128, 1833.0] + - - [1001, 32, 1, 1536, 1001, 1001, 1001, 1536] + - [128, 2692.0] + - - [64, 384, 1, 1152, 64, 64, 64, 1152] + - [128, 2002.0] + - - [64, 192, 1, 1728, 64, 64, 64, 1728] + - [149, 1084.0] + - - [1001, 64, 1, 1536, 1001, 1001, 1001, 1536] + - [144, 3594.0] + - - [1001, 64, 1, 2048, 1001, 1001, 1001, 2048] + - [123, 3669.0] + - - [1024, 64, 1, 4096, 1024, 1024, 1024, 4096] + - [153, 3820.0] + - - [64, 10, 448, 10, 64, 64, 64, 10] + - [123, 969.0] + - - [64, 18, 648, 18, 64, 64, 64, 18] + - [169, 2687.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [145, 3625.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [124, 3647.0] + - - [64, 21, 1472, 21, 64, 64, 64, 21] + - [169, 4033.0] + - - [64, 23, 64, 23, 64, 64, 64, 23] + - [162, 985.0] + - - [64, 26, 56, 26, 64, 64, 64, 26] + - [144, 1111.0] + - - [1024, 1, 1, 2, 1024, 1024, 1024, 2] + - [121, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 83.0] + - - [64, 27, 56, 26, 64, 64, 64, 26] + - [162, 1176.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [121, 10.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [121, 27.0] + - - [64, 31, 1, 30, 64, 64, 64, 30] + - [125, 29.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [131, 30.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [172, 9.0] + - - [64, 14, 1, 15, 64, 64, 64, 15] + - [127, 8.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [127, 9.0] + - - [64, 15, 1, 17, 64, 64, 64, 17] + - [121, 9.0] + - - [100, 512, 1, 2048, 100, 100, 100, 2048] + - [135, 2989.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1600] + - [128, 87.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 200] + - [128, 49.0] + - - [1, 200, 1, 1, 1, 1, 1, 1] + - [121, 0.13] + - - [1, 512, 1, 1, 1, 1, 1, 1] + - [162, 0.35] + - - [67, 512, 1, 2048, 67, 67, 67, 2048] + - [158, 2423.0] + - - [74, 512, 1, 2048, 74, 74, 74, 2048] + - [172, 2676.0] + - - [64, 3, 512, 3, 64, 64, 64, 3] + - [169, 128.0] + - - [64, 5, 512, 5, 64, 64, 64, 5] + - [121, 323.0] + - - [64, 9, 512, 9, 64, 64, 64, 9] + - [123, 897.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [158, 2311.0] + - - [25, 128, 120, 256, 25, 25, 25, 256] + - [157, 3715.0] + - - [25, 128, 139, 256, 25, 25, 25, 256] + - [134, 3987.0] + - - [25, 128, 160, 256, 25, 25, 25, 256] + - [140, 3860.0] + - - [25, 128, 18, 256, 25, 25, 25, 256] + - [153, 2534.0] + - - [25, 128, 19, 256, 25, 25, 25, 256] + - [135, 2323.0] + - - [9, 128, 120, 256, 9, 9, 9, 256] + - [171, 1725.0] + - - [9, 128, 139, 256, 9, 9, 9, 256] + - [139, 1744.0] + - - [9, 128, 160, 256, 9, 9, 9, 256] + - [139, 1801.0] + - - [9, 128, 18, 256, 9, 9, 9, 256] + - [133, 1021.0] + - - [9, 128, 19, 256, 9, 9, 9, 256] + - [133, 1073.0] + - - [1, 256, 1, 1152, 1, 1, 1, 1152] + - [141, 22.0] + - - [100, 512, 1, 2304, 100, 100, 100, 2304] + - [135, 3009.0] + - - [25, 256, 1, 1152, 25, 25, 25, 1152] + - [141, 538.0] + - - [9, 256, 1, 1152, 9, 9, 9, 1152] + - [141, 194.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [123, 3601.0] + - - [1024, 10, 1, 2, 1024, 1024, 1024, 2] + - [162, 14.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [165, 834.0] + - - [1024, 39, 1, 2, 1024, 1024, 1024, 2] + - [144, 53.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 2690.0] + - - [1024, 40, 1, 2, 1024, 1024, 1024, 2] + - [162, 55.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 2763.0] + - - [1024, 41, 1, 2, 1024, 1024, 1024, 2] + - [162, 56.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 2817.0] + - - [1024, 5, 1, 2, 1024, 1024, 1024, 2] + - [121, 7.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 418.0] + - - [1024, 8, 1, 2, 1024, 1024, 1024, 2] + - [121, 11.0] + - - [1024, 9, 1, 2, 1024, 1024, 1024, 2] + - [162, 13.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 751.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [124, 612.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [124, 615.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [162, 3697.0] + - - [64, 14, 10880, 15, 64, 64, 64, 15] + - [162, 3767.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [162, 3941.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [162, 4015.0] + - - [64, 15, 7680, 17, 64, 64, 64, 17] + - [162, 3873.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [124, 3962.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [145, 4035.0] + - - [64, 17, 6144, 21, 64, 64, 64, 21] + - [163, 4298.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [163, 5239.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [154, 5831.0] + - - [64, 24, 4736, 34, 64, 64, 64, 34] + - [136, 5992.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [136, 6335.0] + - - [64, 31, 2048, 30, 64, 64, 64, 30] + - [136, 6349.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [163, 6342.0] + - - [128, 128, 1, 64, 128, 128, 128, 64] + - [133, 460.0] + - - [64, 5, 1, 5, 64, 64, 64, 5] + - [121, 1.0] + - - [32, 33, 1, 33, 32, 32, 32, 33] + - [125, 18.0] + - - [64, 5, 960, 5, 64, 64, 64, 5] + - [121, 449.0] + - - [74, 960, 1, 2048, 74, 74, 74, 2048] + - [169, 3354.0] + - - [128, 27, 32768, 27, 128, 128, 128, 27] + - [143, 2348.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [172, 1269.0] + - - [1024, 16, 1, 2, 1024, 1024, 1024, 2] + - [130, 22.0] + - - [1024, 64, 1, 2, 1024, 1024, 1024, 2] + - [144, 85.0] + - - [1024, 80, 1, 2, 1024, 1024, 1024, 2] + - [144, 105.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [124, 3631.0] + - - [1024, 82, 1, 2, 1024, 1024, 1024, 2] + - [144, 109.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [149, 992.0] + - - [1024, 12, 1, 2, 1024, 1024, 1024, 2] + - [121, 16.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [169, 5974.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [136, 6352.0] + - - [196, 256, 1, 2304, 196, 196, 196, 2304] + - [127, 3042.0] + - - [850, 3, 2, 256, 850, 850, 850, 256] + - [149, 284.0] + - - [850, 12, 2, 256, 850, 850, 850, 256] + - [149, 1116.0] + - - [805, 12, 2, 256, 805, 805, 805, 256] + - [149, 1057.0] + - - [805, 3, 2, 256, 805, 805, 805, 256] + - [149, 266.0] + - - [768, 3, 2, 256, 768, 768, 768, 256] + - [149, 251.0] + - - [768, 12, 2, 256, 768, 768, 768, 256] + - [149, 1008.0] + - - [864, 12, 2, 256, 864, 864, 864, 256] + - [149, 1134.0] + - - [864, 3, 2, 256, 864, 864, 864, 256] + - [149, 286.0] + - - [247, 3, 2, 256, 247, 247, 247, 256] + - [149, 82.0] + - - [216, 3, 2, 256, 216, 216, 216, 256] + - [127, 71.0] + - - [950, 3, 2, 256, 950, 950, 950, 256] + - [149, 313.0] + - - [187, 12, 2, 256, 187, 187, 187, 256] + - [149, 247.0] + - - [176, 12, 2, 256, 176, 176, 176, 256] + - [149, 234.0] + - - [247, 12, 2, 256, 247, 247, 247, 256] + - [128, 324.0] + - - [187, 3, 2, 256, 187, 187, 187, 256] + - [149, 62.0] + - - [228, 12, 2, 256, 228, 228, 228, 256] + - [149, 299.0] + - - [221, 12, 2, 256, 221, 221, 221, 256] + - [128, 290.0] + - - [176, 3, 2, 256, 176, 176, 176, 256] + - [149, 59.0] + - - [950, 12, 2, 256, 950, 950, 950, 256] + - [149, 1247.0] + - - [192, 12, 2, 256, 192, 192, 192, 256] + - [149, 254.0] + - - [228, 3, 2, 256, 228, 228, 228, 256] + - [128, 75.0] + - - [221, 3, 2, 256, 221, 221, 221, 256] + - [127, 73.0] + - - [192, 3, 2, 256, 192, 192, 192, 256] + - [149, 64.0] + - - [216, 12, 2, 256, 216, 216, 216, 256] + - [128, 285.0] + - - [2, 6, 1, 1024, 2, 2, 2, 1024] + - [121, 1.0] + - - [1024, 20, 1, 2, 1024, 1024, 1024, 2] + - [121, 28.0] +- null diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB.yaml index cc848113a..1a06aa243 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB.yaml @@ -1,7 +1,7 @@ -- {MinimumRequiredVersion: 4.33.0} +- {MinimumRequiredVersion: 4.26.0} - navi31 - gfx1100 -- [Device 6863] +- [Device 744c] - AllowNoFreeDims: false AssignedDerivedParameters: true Batched: true @@ -11,7 +11,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -55,16 +54,29314 @@ ZeroPadB: [] - - 1LDSBuffer: 0 AggressivePerfMode: 1 - AssertAlphaValue: false - AssertBetaValue: false - AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 0 AssertSizeEqual: {} - AssertSizeGreaterThan: {} - AssertSizeLessThan: {} - AssertSizeMultiple: {} AssertStrideAEqual: {0: 1} AssertStrideBEqual: {0: 1} AssertStrideCEqual: {0: 1} @@ -72,27 +29369,22 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - AtomicAddC: false BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false CodeObjectVersion: default - CustomKernelName: '' - DepthU: 8 + DepthU: 16 DepthULdsDivisor: 1 DirectToLds: false DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false DisableAtomicFail: 0 DisableKernelPieces: 0 DisableVgprOverlapping: false EdgeType: ShiftPtr EnableMatrixInstruction: false - ExpandPointerSwap: 0 - Fp16AltImpl: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -102,36 +29394,259 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadPerMfma: 1 GlobalReadVectorWidth: 1 GlobalSplitU: 1 - GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 - GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [11, 0, 0] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 + LSPA: 4 + LSPB: 2 + LVCA: 16 LVCB: 32 - LVPA: 32 - LVPB: 8 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false LdsBlockSizePerPad: 0 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 - LdsInitCVgprs: false - LdsNumElements: 512 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -141,18 +29656,16 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false - LoopIters: 8 + LoopIters: 32 LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MACInstruction: FMA - MIArchVgpr: false - MacroTile0: 32 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -161,23 +29674,18 @@ MaxOccupancy: 40 MaxVgprNumber: 256 MinVgprNumber: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 - NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 OptNoLoadLoop: 1 OptPreLoopVmcnt: 0 PackBatchDims: 0 @@ -194,8 +29702,7 @@ PersistentKernel: 0 PersistentKernelAlongBatch: false PrefetchAcrossPersistent: 0 - PrefetchAcrossPersistentMode: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AllowNoFreeDims: false @@ -207,7 +29714,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -253,30 +29759,23 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_ - SourceSwap: false - StaggerU: 32 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StoreCInUnroll: false - StoreCInUnrollExact: false - StoreCInUnrollInterval: 1 - StoreCInUnrollPostLoop: false - StorePriorityOpt: false + StaggerUStride: 128 StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 TransposeLDS: 0 UnrollIncIsDepthU: 0 UnrollMajorLDSA: 0 @@ -291,20 +29790,2123 @@ VectorWidth: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WavefrontSize: 32 + WorkGroup: [8, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _DepthULds: 8 + _DepthULds: 32 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 3 - allowLRVWforTLUandMI: false + _staggerStrideShift: 0 - [2, 3, 0, 1] -- - - [126, 126, 2, 66, 126, 126, 66, 126] - - [0, 0] -- null +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [16, 19791.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [16, 18653.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [16, 18714.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [34, 18995.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [31, 15694.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [16, 20187.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [19, 17089.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 5056] + - [6, 17036.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [28, 19959.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [16, 19010.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [16, 17763.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [28, 19080.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [6, 15327.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [28, 16931.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [16, 19839.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [6, 14226.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 1408] + - [14, 14671.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [32, 18194.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [28, 20095.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [30, 13955.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [16, 18635.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [16, 19853.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [3, 16912.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [9, 15557.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [14, 14773.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [3, 18344.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [28, 17869.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [22, 15034.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [28, 16743.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [16, 20423.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [10, 19227.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 6784] + - [13, 18006.0] + - - [704, 5056, 1, 128, 704, 704, 128, 5056] + - [26, 14879.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [28, 18319.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [16, 20512.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [16, 18577.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [16, 19789.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [9, 15216.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [28, 19972.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [16, 19530.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [16, 16551.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 2944] + - [26, 16216.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [7, 19017.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [16, 17324.0] + - - [448, 5888, 1, 128, 448, 448, 128, 5888] + - [6, 13582.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [28, 18880.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [30, 13981.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [28, 20100.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 2944] + - [25, 18636.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [6, 15290.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [31, 17552.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 6784] + - [13, 17683.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [28, 17138.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [33, 15667.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [10, 18153.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [10, 18256.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [31, 16265.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [28, 20548.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [10, 16622.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [32, 14131.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [30, 13776.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [16, 18363.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [28, 18273.0] + - - [704, 2944, 1, 128, 704, 704, 128, 2944] + - [2, 14171.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [33, 15650.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [7, 16721.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [10, 16558.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 1408] + - [1, 17019.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [16, 19862.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [28, 18866.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [34, 19563.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 4288] + - [6, 16939.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [28, 19395.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [16, 19916.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [16, 20316.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [10, 19836.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [16, 19535.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [32, 16868.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [17, 20348.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [28, 18819.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [16, 19868.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [34, 18725.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 5888] + - [31, 18974.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [17, 17458.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 3584] + - [1, 17612.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [16, 20674.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [28, 19153.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [8, 17172.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 2368] + - [30, 16990.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [15, 15138.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [5, 17792.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [34, 19497.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [34, 16059.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [28, 18877.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [16, 16617.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [22, 16595.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [16, 19433.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [31, 17484.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 448] + - [30, 14234.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [3, 18065.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [16, 20579.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [16, 19173.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [5, 15019.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [3, 16792.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [28, 20065.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 5888] + - [31, 18715.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 3584] + - [26, 17133.0] + - - [448, 3584, 1, 128, 448, 448, 128, 3584] + - [0, 12190.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [28, 20090.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 5888] + - [31, 17811.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [28, 19955.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 704] + - [6, 13471.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [28, 20196.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 2368] + - [26, 17309.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 704] + - [30, 15382.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [10, 18869.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [28, 19893.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [28, 20545.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [29, 20041.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [32, 15008.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 5888] + - [0, 17173.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [28, 19900.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [28, 20050.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [7, 18731.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [28, 18226.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [3, 18676.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [2, 18403.0] + - - [256, 5056, 1, 128, 256, 256, 128, 5056] + - [24, 12984.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [10, 18077.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [16, 19112.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [32, 15522.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 1408] + - [8, 16662.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [16, 19310.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [33, 14229.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [3, 18746.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 2368] + - [7, 18094.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [10, 19085.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [17, 15497.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [16, 20017.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [16, 16312.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 2944] + - [34, 19218.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 1856] + - [16, 17351.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 1024] + - [18, 13943.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 3584] + - [3, 18485.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [28, 20653.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [28, 18467.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [28, 17459.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [16, 20122.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [30, 14310.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [33, 14861.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [18, 14222.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [28, 19501.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 704] + - [18, 14343.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [16, 19051.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 5888] + - [28, 19221.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [32, 16778.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [16, 19155.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [28, 19001.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [10, 18707.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 4288] + - [28, 18595.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 1856] + - [27, 14933.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [28, 19185.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [28, 20026.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 2368] + - [26, 17371.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 4288] + - [31, 16548.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [3, 17597.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [28, 18868.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 6784] + - [16, 19215.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [10, 19500.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [30, 14494.0] + - - [448, 4288, 1, 128, 448, 448, 128, 4288] + - [24, 12887.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [17, 19951.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [35, 16602.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 4288] + - [30, 16800.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [29, 17370.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [10, 19634.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [16, 19209.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [28, 18383.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [28, 18108.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [28, 19876.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 2368] + - [19, 17570.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [28, 19970.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [22, 19762.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 3584] + - [26, 16820.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [19, 15232.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 6784] + - [28, 19266.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [19, 18183.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [6, 15119.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [3, 19386.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [17, 20277.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 5888] + - [34, 19339.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 5888] + - [19, 18845.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [28, 18969.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [28, 18822.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [16, 18990.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [16, 20031.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [31, 16767.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 704] + - [30, 15780.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [22, 18474.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [5, 16740.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 2368] + - [2, 15955.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [16, 16265.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 6784] + - [28, 19282.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [33, 15027.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [5, 14013.0] + - - [256, 5888, 1, 128, 256, 256, 128, 5888] + - [26, 13492.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [28, 19891.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [34, 18760.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [16, 18059.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 4288] + - [31, 18567.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [17, 17767.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 704] + - [30, 14966.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 5056] + - [19, 18459.0] + - - [448, 5056, 1, 128, 448, 448, 128, 5056] + - [0, 12909.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 5056] + - [31, 17173.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 3584] + - [16, 18727.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [28, 18428.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [16, 20061.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 5056] + - [1, 17215.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [29, 20409.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [28, 20051.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [28, 19963.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [31, 16483.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [12, 15155.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [17, 20278.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [16, 20601.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [28, 20043.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 3584] + - [31, 18900.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 1856] + - [6, 14766.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [34, 17760.0] + - - [704, 3584, 1, 128, 704, 704, 128, 3584] + - [32, 14054.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [9, 15236.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [16, 19385.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 2944] + - [10, 18119.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [17, 19935.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [28, 19986.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [28, 19305.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [29, 16650.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 3584] + - [34, 18684.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [33, 15227.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [16, 19138.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [13, 16452.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [16, 20457.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [31, 18287.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [29, 15945.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 4288] + - [31, 18836.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [16, 17722.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [34, 18843.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [3, 17822.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [3, 19135.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 3584] + - [0, 16199.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 1408] + - [6, 14395.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 2944] + - [1, 16723.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 2944] + - [19, 18469.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [16, 20147.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 2368] + - [14, 15496.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 2368] + - [25, 17922.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [16, 20252.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [7, 18019.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [16, 19535.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [28, 18545.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 5056] + - [22, 18353.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [28, 18586.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [29, 19894.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [7, 15648.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [16, 20008.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [28, 18374.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 448] + - [30, 13665.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [16, 17639.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [28, 18125.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [1, 16883.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [17, 20517.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [28, 19846.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [16, 18033.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [28, 19868.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [3, 18984.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 1408] + - [26, 16186.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [28, 20424.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [34, 18904.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [3, 19105.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [10, 19406.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [10, 17742.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [31, 16664.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [28, 20018.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [16, 18164.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [36, 17155.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [28, 17519.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [3, 18657.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 2944] + - [16, 18259.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [17, 20444.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [28, 18981.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [16, 19237.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [34, 16072.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [29, 19067.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [34, 18012.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [28, 16692.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [33, 15871.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [28, 19055.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [28, 15795.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 2368] + - [0, 15009.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [16, 19539.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [0, 15771.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [10, 19653.0] + - - [448, 6784, 1, 128, 448, 448, 128, 6784] + - [6, 14044.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [16, 20008.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [28, 17474.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [17, 18885.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [16, 18995.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 448] + - [6, 13752.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [33, 13880.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [10, 17282.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [5, 18737.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [18, 15817.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [16, 18503.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [5, 15487.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [10, 18281.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [3, 17989.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [32, 16782.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [30, 15492.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 2368] + - [30, 16043.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 1408] + - [31, 18044.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [10, 19522.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [31, 17994.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 3584] + - [1, 18306.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [28, 19702.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 1024] + - [30, 13500.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [8, 15321.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [28, 19383.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [16, 19379.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [10, 15763.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [16, 18632.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [32, 15125.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [32, 15299.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [5, 17285.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 1024] + - [28, 17963.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [16, 16931.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 5056] + - [19, 18473.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [16, 18836.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [16, 19482.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 6784] + - [25, 18734.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [28, 20451.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 1856] + - [31, 16446.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 2944] + - [2, 16480.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 448] + - [30, 15196.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [28, 14768.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 1856] + - [26, 15991.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [32, 15213.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [10, 16010.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 1024] + - [32, 15860.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [3, 16124.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [16, 18868.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [16, 19485.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [32, 14166.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [28, 19765.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 1024] + - [32, 17044.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [29, 16962.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [29, 20246.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [3, 19684.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [10, 17534.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [16, 18589.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 1856] + - [18, 16534.0] + - - [256, 6784, 1, 128, 256, 256, 128, 6784] + - [35, 15331.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 6784] + - [13, 18888.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 5056] + - [13, 18397.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 5888] + - [1, 17433.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [28, 19651.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [28, 17828.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [16, 19762.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [17, 17041.0] + - - [704, 5888, 1, 128, 704, 704, 128, 5888] + - [26, 15424.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 3584] + - [31, 19126.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [17, 20030.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 1408] + - [30, 14402.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [10, 18227.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [28, 16832.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [18, 13132.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [31, 15727.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [29, 17800.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [0, 16005.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [29, 20170.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [34, 19847.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [34, 17290.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [28, 19384.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [7, 15273.0] + - - [704, 4288, 1, 128, 704, 704, 128, 4288] + - [8, 14386.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 4288] + - [26, 17441.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 6784] + - [10, 17855.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [29, 16342.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [29, 16039.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [20, 16842.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 5056] + - [25, 18718.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [18, 13404.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [16, 19445.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [19, 15203.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [16, 17583.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 256] + - [0, 11953.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 5888] + - [7, 17793.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [30, 14612.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 1856] + - [6, 14585.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [28, 20346.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [28, 17137.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [28, 19812.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [13, 18473.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [28, 18178.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [28, 19740.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [28, 18466.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [16, 19401.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [16, 16711.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [16, 18832.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 1024] + - [30, 15325.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [16, 19876.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [28, 20593.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 5056] + - [25, 18882.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [16, 19173.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [30, 13772.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [30, 13962.0] + - - [704, 2368, 1, 128, 704, 704, 128, 2368] + - [12, 12793.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 256] + - [4, 12914.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 1856] + - [26, 16900.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 4288] + - [31, 18347.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [5, 15945.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [28, 17901.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [28, 18561.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [9, 14466.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 6784] + - [28, 19001.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [16, 20350.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [31, 16785.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [16, 20000.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [28, 18511.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 5888] + - [34, 19253.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [34, 19558.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [32, 14356.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [16, 17329.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [3, 19032.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [7, 19126.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [28, 19016.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [21, 15868.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [16, 20428.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [17, 19158.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [6, 16092.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [17, 16813.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [31, 15809.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [16, 18475.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [18, 13470.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [28, 19581.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [29, 20600.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [3, 20151.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [16, 18914.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [33, 16079.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [7, 17608.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 1408] + - [28, 17645.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [28, 19413.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [10, 19173.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [31, 17428.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [19, 17316.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 1024] + - [11, 16863.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 256] + - [35, 15205.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 704] + - [30, 15688.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [17, 18356.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [29, 18983.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [16, 18583.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [28, 19505.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [17, 15909.0] + - - [704, 6784, 1, 128, 704, 704, 128, 6784] + - [20, 15862.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [16, 19435.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [34, 20304.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [28, 19468.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 6784] + - [25, 18281.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [5, 15610.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [22, 19349.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [3, 16753.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 5056] + - [19, 17115.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [16, 20559.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [30, 14736.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 448] + - [6, 14808.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 2944] + - [10, 18547.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [32, 18009.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [3, 19467.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [28, 18200.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 4288] + - [28, 18211.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [29, 20162.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [30, 14057.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 1408] + - [31, 16971.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [16, 19016.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [16, 16764.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [28, 18562.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [10, 16093.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [16, 19073.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 2944] + - [12, 15276.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [3, 17646.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [17, 20188.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [17, 16998.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [28, 19706.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [2, 18085.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [31, 17850.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 1856] + - [10, 17664.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 704] + - [30, 14156.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [2, 15835.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [16, 20070.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 1408] + - [26, 15760.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 1024] + - [30, 13981.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [29, 20366.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [28, 18833.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [16, 18165.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 4288] + - [31, 18139.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [28, 19147.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [32, 16614.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [12, 15285.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 1856] + - [19, 17213.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [10, 18426.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [28, 19893.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [16, 15652.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [16, 20699.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [5, 16516.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [34, 16930.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 1024] + - [2, 16253.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [34, 18886.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [29, 19142.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [34, 20260.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [34, 20664.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [23, 17850.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [23, 20437.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [22, 20170.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [29, 20673.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [12, 15534.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [29, 19423.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [16, 17793.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [32, 19142.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [25, 19228.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [13, 19836.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [28, 19805.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [28, 20194.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [17, 20723.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [3, 20397.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [28, 20382.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [28, 20511.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [34, 20680.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [28, 20589.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [17, 20576.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [28, 20655.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [28, 20756.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [67, 14630.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [72, 13695.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 448] + - [69, 12146.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [41, 17035.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [59, 14428.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [72, 11896.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [37, 10379.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [51, 14043.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [70, 9184.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [61, 7376.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [68, 7844.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [47, 12150.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [49, 12188.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [49, 11844.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [65, 13722.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [45, 7537.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [63, 10266.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [69, 13745.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [69, 11302.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [78, 15257.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [45, 8307.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [62, 11736.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [72, 15338.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 256] + - [46, 10285.0] + - - [448, 2944, 1, 128, 448, 448, 128, 2944] + - [69, 13251.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [68, 8781.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [63, 8786.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [75, 10285.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [61, 9261.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 448] + - [38, 7935.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [72, 14322.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [45, 8903.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [46, 11785.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [71, 9342.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 256] + - [64, 10834.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [52, 13684.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [45, 8537.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [78, 12067.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [39, 10597.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [69, 12848.0] + - - [448, 1856, 1, 128, 448, 448, 128, 1856] + - [38, 11298.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [54, 13048.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [40, 12211.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [44, 13349.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [41, 15015.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [65, 14161.0] + - - [704, 1856, 1, 128, 704, 704, 128, 1856] + - [69, 12965.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [44, 12539.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [39, 10355.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [46, 8938.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [54, 11791.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [57, 12669.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [64, 13978.0] + - - [448, 2368, 1, 128, 448, 448, 128, 2368] + - [38, 12124.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [78, 12973.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [54, 14263.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [39, 8966.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [49, 13415.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [62, 9775.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [49, 14724.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [68, 8947.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [52, 12350.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [50, 14054.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [54, 11577.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [41, 15738.0] + - - [448, 1024, 1, 128, 448, 448, 128, 1024] + - [37, 7978.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [66, 14252.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 64] + - [45, 6595.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [62, 11578.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [57, 13401.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [75, 9654.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [49, 16523.0] + - - [256, 1856, 1, 128, 256, 256, 128, 1856] + - [71, 8196.0] + - - [448, 1408, 1, 128, 448, 448, 128, 1408] + - [39, 8991.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [65, 11700.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [53, 7660.0] + - - [704, 1408, 1, 128, 704, 704, 128, 1408] + - [38, 11726.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 448] + - [62, 12789.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [39, 7780.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [43, 13027.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [40, 11169.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [68, 9063.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [41, 16169.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [58, 14174.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [42, 11250.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [73, 13222.0] + - - [256, 2368, 1, 128, 256, 256, 128, 2368] + - [55, 8660.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [77, 12985.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [51, 11483.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [62, 9822.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [59, 14294.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [40, 10509.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [65, 13996.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [64, 13972.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [46, 8643.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [77, 11098.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [64, 13335.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 448] + - [46, 11251.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [74, 8527.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 256] + - [55, 8699.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [68, 9043.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [54, 10972.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [59, 14306.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [40, 9595.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [53, 8947.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [56, 10647.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [65, 16527.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [52, 14690.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 256] + - [45, 7442.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [53, 10225.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [37, 7490.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [67, 10938.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [55, 10594.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [59, 14604.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [56, 14236.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [64, 11651.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [51, 14676.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [45, 7684.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 704] + - [46, 10207.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [71, 12156.0] + - - [704, 1024, 1, 128, 704, 704, 128, 1024] + - [39, 10207.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [49, 10408.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 448] + - [70, 8951.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [71, 13821.0] + - - [704, 448, 1, 128, 704, 704, 128, 448] + - [37, 6706.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [49, 11972.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [70, 8999.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [60, 10646.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [75, 12808.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [44, 12463.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [49, 11487.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [51, 13531.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [78, 14805.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [75, 13918.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [62, 13069.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [37, 8697.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [64, 11464.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [61, 9109.0] + - - [64, 5888, 1, 128, 64, 64, 128, 5888] + - [37, 7755.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [54, 11722.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [53, 7719.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [76, 10572.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [40, 11947.0] + - - [704, 704, 1, 128, 704, 704, 128, 704] + - [37, 8504.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [42, 10419.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [53, 6339.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [70, 9103.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [78, 15584.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [72, 12378.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [71, 11239.0] + - - [256, 3584, 1, 128, 256, 256, 128, 3584] + - [40, 10996.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 256] + - [54, 12636.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [64, 9841.0] + - - [256, 2944, 1, 128, 256, 256, 128, 2944] + - [40, 10555.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [68, 10431.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [58, 12823.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [78, 16896.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 704] + - [54, 12845.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [48, 9172.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [64, 10182.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [51, 10518.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [46, 7893.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [64, 9923.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [51, 14358.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [42, 14108.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 256] + - [53, 8174.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [63, 9909.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [75, 9471.0] + - - [64, 6784, 1, 128, 64, 64, 128, 6784] + - [61, 7740.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [45, 9949.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [37, 7978.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [45, 8705.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [71, 14067.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [53, 10171.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 1024] + - [47, 12405.0] + - - [64, 5056, 1, 128, 64, 64, 128, 5056] + - [37, 6746.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 64] + - [45, 7286.0] + - - [448, 704, 1, 128, 448, 448, 128, 704] + - [37, 6728.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 704] + - [71, 11705.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [64, 11426.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [77, 13328.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [72, 14122.0] + - - [256, 1408, 1, 128, 256, 256, 128, 1408] + - [37, 7418.0] + - - [256, 4288, 1, 128, 256, 256, 128, 4288] + - [40, 12750.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [46, 10307.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [44, 12864.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [39, 10355.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 64] + - [74, 7697.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [49, 12587.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [37, 8641.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [65, 13829.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [52, 11655.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [58, 14453.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [46, 9674.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [63, 10263.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [37, 6381.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [54, 13809.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [59, 12963.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [57, 11900.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [44, 12879.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [79, 10368.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [41, 13000.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [37, 8722.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [75, 14290.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [65, 13154.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [75, 12916.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [49, 14096.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [68, 9063.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [51, 14414.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [78, 14619.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [49, 14500.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [116, 3525.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [96, 5110.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 64] + - [80, 2658.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [103, 4128.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [105, 4853.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [107, 7167.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [117, 5095.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [96, 4210.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [92, 7498.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [107, 6457.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [107, 6261.0] + - - [704, 256, 1, 128, 704, 704, 128, 256] + - [109, 4241.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [109, 4119.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [117, 7346.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [107, 6168.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [105, 5254.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [83, 6017.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [81, 5161.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [107, 6225.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [88, 5604.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [109, 5143.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [117, 3600.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [109, 4294.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [117, 6530.0] + - - [64, 2944, 1, 128, 64, 64, 128, 2944] + - [81, 4246.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [105, 6140.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [96, 5900.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [83, 6362.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [117, 4000.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [99, 4969.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [109, 4447.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [109, 4950.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 64] + - [107, 5611.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [114, 5166.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [109, 4951.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [114, 4190.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [109, 4047.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 256] + - [99, 5412.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 64] + - [116, 4102.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [107, 4033.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [88, 3305.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [92, 5066.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [117, 6388.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [99, 5457.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 64] + - [117, 4408.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [83, 7032.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [92, 6129.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [83, 6484.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [90, 5810.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [83, 7474.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [81, 4087.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [88, 4919.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 64] + - [109, 3464.0] + - - [64, 1408, 1, 128, 64, 64, 128, 1408] + - [80, 2773.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [109, 6129.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [110, 4942.0] + - - [448, 256, 1, 128, 448, 448, 128, 256] + - [107, 3164.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [109, 4518.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [80, 2708.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [117, 4973.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [109, 5942.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [103, 5104.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 64] + - [109, 3208.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [80, 2720.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [109, 6049.0] + - - [256, 448, 1, 128, 256, 256, 128, 448] + - [110, 3164.0] + - - [64, 3584, 1, 128, 64, 64, 128, 3584] + - [83, 4782.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [90, 4474.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [105, 5603.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [96, 6016.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [87, 3231.0] + - - [64, 1856, 1, 128, 64, 64, 128, 1856] + - [81, 3249.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [109, 4199.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [109, 5798.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [99, 4953.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [103, 4984.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [83, 5734.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [83, 7330.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [103, 4335.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [110, 5661.0] + - - [256, 704, 1, 128, 256, 256, 128, 704] + - [109, 4075.0] + - - [256, 1024, 1, 128, 256, 256, 128, 1024] + - [99, 5395.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [96, 4213.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [105, 6016.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [107, 7026.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [107, 4498.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [92, 6281.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [88, 4857.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [83, 7160.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [107, 5244.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [92, 4112.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [103, 3258.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [99, 6166.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [92, 5271.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [97, 5380.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [109, 6112.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [109, 4516.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [99, 5652.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [83, 6505.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [109, 5924.0] + - - [64, 2368, 1, 128, 64, 64, 128, 2368] + - [109, 3606.0] + - - [64, 4288, 1, 128, 64, 64, 128, 4288] + - [99, 5666.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [107, 5642.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [88, 3277.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [116, 5788.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [109, 5864.0] + - - [448, 448, 1, 128, 448, 448, 128, 448] + - [110, 4539.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [109, 4982.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [107, 6704.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [117, 5731.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 4] + - [121, 824.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 4] + - [121, 542.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 4] + - [119, 586.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 4] + - [122, 1004.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 4] + - [121, 514.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 4] + - [121, 421.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 4] + - [121, 501.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 4] + - [122, 734.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 4] + - [122, 842.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 4] + - [122, 685.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 4] + - [122, 452.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 4] + - [122, 288.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 4] + - [122, 981.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 4] + - [120, 365.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 4] + - [122, 718.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 4] + - [122, 1080.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 4] + - [119, 625.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 4] + - [122, 1037.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 4] + - [119, 710.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 4] + - [122, 348.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 4] + - [119, 278.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 4] + - [121, 400.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 4] + - [121, 784.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 4] + - [122, 581.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 4] + - [120, 480.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 4] + - [121, 628.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 4] + - [121, 569.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 4] + - [119, 1032.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 4] + - [119, 944.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 4] + - [122, 374.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 4] + - [104, 212.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 4] + - [119, 768.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 4] + - [122, 1005.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 4] + - [119, 704.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 4] + - [122, 826.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 4] + - [121, 1015.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 1856] + - [133, 741.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 2944] + - [126, 964.0] + - - [4, 1408, 1, 128, 4, 4, 128, 1408] + - [87, 215.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 2368] + - [126, 778.0] + - - [4, 3584, 1, 128, 4, 4, 128, 3584] + - [132, 501.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 5888] + - [129, 1492.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 1408] + - [130, 562.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 6784] + - [131, 1714.0] + - - [4, 4288, 1, 128, 4, 4, 128, 4288] + - [132, 597.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 5056] + - [132, 1374.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 6784] + - [127, 1614.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 2944] + - [133, 1059.0] + - - [4, 5056, 1, 256, 4, 4, 256, 5056] + - [124, 952.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 5056] + - [124, 1296.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 2368] + - [130, 858.0] + - - [4, 1856, 1, 256, 4, 4, 256, 1856] + - [125, 404.0] + - - [4, 2368, 1, 256, 4, 4, 256, 2368] + - [124, 474.0] + - - [4, 2944, 1, 256, 4, 4, 256, 2944] + - [129, 596.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 4288] + - [126, 1252.0] + - - [4, 6784, 1, 128, 4, 4, 128, 6784] + - [123, 886.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 3584] + - [133, 1058.0] + - - [4, 5888, 1, 256, 4, 4, 256, 5888] + - [132, 1061.0] + - - [4, 6784, 1, 256, 4, 4, 256, 6784] + - [128, 1106.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1408] + - [126, 494.0] + - - [4, 3584, 1, 256, 4, 4, 256, 3584] + - [124, 708.0] + - - [4, 1408, 1, 256, 4, 4, 256, 1408] + - [125, 311.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 4288] + - [126, 1371.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 5888] + - [129, 1418.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1856] + - [126, 649.0] + - - [4, 1856, 1, 128, 4, 4, 128, 1856] + - [87, 283.0] + - - [4, 2944, 1, 128, 4, 4, 128, 2944] + - [87, 423.0] + - - [4, 5056, 1, 128, 4, 4, 128, 5056] + - [132, 688.0] + - - [4, 4288, 1, 256, 4, 4, 256, 4288] + - [124, 832.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3584] + - [126, 1167.0] + - - [4, 5888, 1, 128, 4, 4, 128, 5888] + - [129, 802.0] + - - [4, 2368, 1, 128, 4, 4, 128, 2368] + - [87, 355.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 704] + - [86, 226.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [104, 477.0] + - - [64, 4, 1, 256, 64, 64, 256, 4] + - [93, 14.0] + - - [64, 704, 1, 128, 64, 64, 128, 704] + - [87, 1780.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [100, 2095.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 4] + - [86, 43.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [82, 3495.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [84, 2866.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 64] + - [106, 2305.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [82, 3622.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [88, 3654.0] + - - [4, 704, 1, 256, 4, 4, 256, 704] + - [93, 152.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 4] + - [93, 243.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [87, 1549.0] + - - [64, 1024, 1, 128, 64, 64, 128, 1024] + - [82, 2383.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 64] + - [112, 22.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [85, 2522.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [85, 2083.0] + - - [448, 4, 1, 256, 448, 448, 256, 4] + - [93, 100.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 4] + - [93, 153.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [87, 20.0] + - - [256, 4, 1, 128, 256, 256, 128, 4] + - [87, 41.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [100, 3001.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [87, 481.0] + - - [704, 64, 1, 128, 704, 704, 128, 64] + - [87, 1848.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 4] + - [93, 228.0] + - - [256, 256, 1, 128, 256, 256, 128, 256] + - [98, 2370.0] + - - [64, 256, 1, 128, 64, 64, 128, 256] + - [115, 718.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [102, 2945.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [82, 2594.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [85, 2378.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [93, 2238.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [113, 1336.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [112, 780.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [86, 1469.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [87, 1755.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [84, 2220.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [82, 3612.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 4] + - [94, 357.0] + - - [4, 4, 1, 256, 4, 4, 256, 4] + - [80, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [84, 958.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [85, 2370.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [86, 699.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 448] + - [86, 169.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [82, 2873.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 4] + - [101, 86.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [87, 2335.0] + - - [4, 704, 1, 128, 4, 4, 128, 704] + - [93, 108.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [87, 2558.0] + - - [448, 64, 1, 128, 448, 448, 128, 64] + - [87, 1184.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 448] + - [86, 151.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [82, 2844.0] + - - [256, 64, 1, 128, 256, 256, 128, 64] + - [87, 677.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 1024] + - [86, 367.0] + - - [704, 4, 1, 128, 704, 704, 128, 4] + - [113, 111.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [113, 56.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 4] + - [86, 96.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [93, 55.0] + - - [4, 4, 1, 128, 4, 4, 128, 4] + - [80, 1.0] + - - [4, 128, 1, 256, 4, 4, 256, 128] + - [93, 28.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [112, 351.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [108, 3321.0] + - - [4, 448, 1, 128, 4, 4, 128, 448] + - [104, 70.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [118, 1312.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 128] + - [94, 48.0] + - - [64, 4, 1, 128, 64, 64, 128, 4] + - [87, 10.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [93, 240.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 704] + - [94, 253.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 4] + - [80, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [84, 699.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 4] + - [89, 159.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [86, 390.0] + - - [4, 64, 1, 128, 4, 4, 128, 64] + - [87, 10.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [112, 703.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [93, 1336.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [87, 1770.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [93, 1372.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 4] + - [93, 342.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [87, 2316.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [108, 3211.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [86, 778.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [87, 1542.0] + - - [4, 256, 1, 128, 4, 4, 128, 256] + - [87, 39.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [106, 2787.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [104, 354.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 4] + - [80, 1.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1024] + - [94, 326.0] + - - [704, 4, 1, 256, 704, 704, 256, 4] + - [93, 155.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 4] + - [86, 48.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 4] + - [86, 168.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 4] + - [94, 253.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [108, 3189.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [91, 3564.0] + - - [4, 1024, 1, 128, 4, 4, 128, 1024] + - [93, 157.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [101, 1470.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [82, 2050.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [87, 1344.0] + - - [128, 4, 1, 256, 128, 128, 256, 4] + - [87, 28.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [82, 3489.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [85, 2520.0] + - - [448, 4, 1, 128, 448, 448, 128, 4] + - [87, 70.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 256] + - [101, 97.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [87, 20.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 256] + - [86, 86.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 4] + - [86, 24.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 64] + - [86, 24.0] + - - [4, 1024, 1, 256, 4, 4, 256, 1024] + - [93, 225.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [93, 936.0] + - - [4, 64, 1, 256, 4, 4, 256, 64] + - [93, 14.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [82, 2097.0] + - - [64, 448, 1, 128, 64, 64, 128, 448] + - [87, 1176.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [111, 3007.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [95, 3352.0] + - - [4, 448, 1, 256, 4, 4, 256, 448] + - [93, 98.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 128] + - [86, 43.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [87, 343.0] + - - [64, 64, 1, 128, 64, 64, 128, 64] + - [89, 176.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 4] + - [86, 21.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [112, 1467.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [104, 949.0] - null -- DeviceEfficiency diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB_GB.yaml new file mode 100644 index 000000000..41537f0d2 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bjlk_SB_GB.yaml @@ -0,0 +1,31912 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi31 +- gfx1100 +- [Device 744c] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [16, 19791.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [16, 18653.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [16, 18714.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [34, 18995.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [31, 15694.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [16, 20187.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [19, 17089.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 5056] + - [6, 17036.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [28, 19959.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [16, 19010.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [16, 17763.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [28, 19080.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [6, 15327.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [28, 16931.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [16, 19839.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [6, 14226.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 1408] + - [14, 14671.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [32, 18194.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [28, 20095.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [30, 13955.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [16, 18635.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [16, 19853.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [3, 16912.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [9, 15557.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [14, 14773.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [3, 18344.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [28, 17869.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [22, 15034.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [28, 16743.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [16, 20423.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [10, 19227.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 6784] + - [13, 18006.0] + - - [704, 5056, 1, 128, 704, 704, 128, 5056] + - [26, 14879.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [28, 18319.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [16, 20512.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [16, 18577.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [16, 19789.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [9, 15216.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [28, 19972.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [16, 19530.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [16, 16551.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 2944] + - [26, 16216.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [7, 19017.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [16, 17324.0] + - - [448, 5888, 1, 128, 448, 448, 128, 5888] + - [6, 13582.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [28, 18880.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [30, 13981.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [28, 20100.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 2944] + - [25, 18636.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [6, 15290.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [31, 17552.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 6784] + - [13, 17683.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [28, 17138.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [33, 15667.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [10, 18153.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [10, 18256.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [31, 16265.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [28, 20548.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [10, 16622.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [32, 14131.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [30, 13776.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [16, 18363.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [28, 18273.0] + - - [704, 2944, 1, 128, 704, 704, 128, 2944] + - [2, 14171.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [33, 15650.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [7, 16721.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [10, 16558.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 1408] + - [1, 17019.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [16, 19862.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [28, 18866.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [34, 19563.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 4288] + - [6, 16939.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [28, 19395.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [16, 19916.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [16, 20316.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [10, 19836.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [16, 19535.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [32, 16868.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [17, 20348.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [28, 18819.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [16, 19868.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [34, 18725.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 5888] + - [31, 18974.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [17, 17458.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 3584] + - [1, 17612.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [16, 20674.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [28, 19153.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [8, 17172.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 2368] + - [30, 16990.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [15, 15138.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [5, 17792.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [34, 19497.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [34, 16059.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [28, 18877.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [16, 16617.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [22, 16595.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [16, 19433.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [31, 17484.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 448] + - [30, 14234.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [3, 18065.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [16, 20579.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [16, 19173.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [5, 15019.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [3, 16792.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [28, 20065.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 5888] + - [31, 18715.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 3584] + - [26, 17133.0] + - - [448, 3584, 1, 128, 448, 448, 128, 3584] + - [0, 12190.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [28, 20090.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 5888] + - [31, 17811.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [28, 19955.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 704] + - [6, 13471.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [28, 20196.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 2368] + - [26, 17309.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 704] + - [30, 15382.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [10, 18869.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [28, 19893.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [28, 20545.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [29, 20041.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [32, 15008.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 5888] + - [0, 17173.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [28, 19900.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [28, 20050.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [7, 18731.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [28, 18226.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [3, 18676.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [2, 18403.0] + - - [256, 5056, 1, 128, 256, 256, 128, 5056] + - [24, 12984.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [10, 18077.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [16, 19112.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [32, 15522.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 1408] + - [8, 16662.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [16, 19310.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [33, 14229.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [3, 18746.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 2368] + - [7, 18094.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [10, 19085.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [17, 15497.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [16, 20017.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [16, 16312.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 2944] + - [34, 19218.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 1856] + - [16, 17351.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 1024] + - [18, 13943.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 3584] + - [3, 18485.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [28, 20653.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [28, 18467.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [28, 17459.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [16, 20122.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [30, 14310.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [33, 14861.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [18, 14222.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [28, 19501.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 704] + - [18, 14343.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [16, 19051.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 5888] + - [28, 19221.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [32, 16778.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [16, 19155.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [28, 19001.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [10, 18707.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 4288] + - [28, 18595.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 1856] + - [27, 14933.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [28, 19185.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [28, 20026.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 2368] + - [26, 17371.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 4288] + - [31, 16548.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [3, 17597.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [28, 18868.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 6784] + - [16, 19215.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [10, 19500.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [30, 14494.0] + - - [448, 4288, 1, 128, 448, 448, 128, 4288] + - [24, 12887.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [17, 19951.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [35, 16602.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 4288] + - [30, 16800.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [29, 17370.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [10, 19634.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [16, 19209.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [28, 18383.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [28, 18108.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [28, 19876.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 2368] + - [19, 17570.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [28, 19970.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [22, 19762.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 3584] + - [26, 16820.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [19, 15232.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 6784] + - [28, 19266.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [19, 18183.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [6, 15119.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [3, 19386.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [17, 20277.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 5888] + - [34, 19339.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 5888] + - [19, 18845.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [28, 18969.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [28, 18822.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [16, 18990.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [16, 20031.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [31, 16767.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 704] + - [30, 15780.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [22, 18474.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [5, 16740.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 2368] + - [2, 15955.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [16, 16265.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 6784] + - [28, 19282.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [33, 15027.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [5, 14013.0] + - - [256, 5888, 1, 128, 256, 256, 128, 5888] + - [26, 13492.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [28, 19891.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [34, 18760.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [16, 18059.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 4288] + - [31, 18567.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [17, 17767.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 704] + - [30, 14966.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 5056] + - [19, 18459.0] + - - [448, 5056, 1, 128, 448, 448, 128, 5056] + - [0, 12909.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 5056] + - [31, 17173.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 3584] + - [16, 18727.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [28, 18428.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [16, 20061.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 5056] + - [1, 17215.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [29, 20409.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [28, 20051.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [28, 19963.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [31, 16483.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [12, 15155.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [17, 20278.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [16, 20601.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [28, 20043.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 3584] + - [31, 18900.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 1856] + - [6, 14766.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [34, 17760.0] + - - [704, 3584, 1, 128, 704, 704, 128, 3584] + - [32, 14054.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [9, 15236.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [16, 19385.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 2944] + - [10, 18119.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [17, 19935.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [28, 19986.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [28, 19305.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [29, 16650.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 3584] + - [34, 18684.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [33, 15227.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [16, 19138.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [13, 16452.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [16, 20457.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [31, 18287.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [29, 15945.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 4288] + - [31, 18836.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [16, 17722.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [34, 18843.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [3, 17822.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [3, 19135.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 3584] + - [0, 16199.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 1408] + - [6, 14395.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 2944] + - [1, 16723.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 2944] + - [19, 18469.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [16, 20147.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 2368] + - [14, 15496.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 2368] + - [25, 17922.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [16, 20252.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [7, 18019.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [16, 19535.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [28, 18545.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 5056] + - [22, 18353.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [28, 18586.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [29, 19894.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [7, 15648.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [16, 20008.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [28, 18374.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 448] + - [30, 13665.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [16, 17639.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [28, 18125.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [1, 16883.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [17, 20517.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [28, 19846.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [16, 18033.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [28, 19868.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [3, 18984.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 1408] + - [26, 16186.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [28, 20424.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [34, 18904.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [3, 19105.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [10, 19406.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [10, 17742.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [31, 16664.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [28, 20018.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [16, 18164.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [36, 17155.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [28, 17519.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [3, 18657.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 2944] + - [16, 18259.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [17, 20444.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [28, 18981.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [16, 19237.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [34, 16072.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [29, 19067.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [34, 18012.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [28, 16692.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [33, 15871.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [28, 19055.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [28, 15795.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 2368] + - [0, 15009.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [16, 19539.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [0, 15771.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [10, 19653.0] + - - [448, 6784, 1, 128, 448, 448, 128, 6784] + - [6, 14044.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [16, 20008.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [28, 17474.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [17, 18885.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [16, 18995.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 448] + - [6, 13752.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [33, 13880.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [10, 17282.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [5, 18737.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [18, 15817.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [16, 18503.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [5, 15487.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [10, 18281.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [3, 17989.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [32, 16782.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [30, 15492.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 2368] + - [30, 16043.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 1408] + - [31, 18044.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [10, 19522.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [31, 17994.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 3584] + - [1, 18306.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [28, 19702.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 1024] + - [30, 13500.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [8, 15321.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [28, 19383.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [16, 19379.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [10, 15763.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [16, 18632.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [32, 15125.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [32, 15299.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [5, 17285.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 1024] + - [28, 17963.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [16, 16931.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 5056] + - [19, 18473.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [16, 18836.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [16, 19482.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 6784] + - [25, 18734.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [28, 20451.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 1856] + - [31, 16446.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 2944] + - [2, 16480.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 448] + - [30, 15196.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [28, 14768.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 1856] + - [26, 15991.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [32, 15213.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [10, 16010.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 1024] + - [32, 15860.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [3, 16124.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [16, 18868.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [16, 19485.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [32, 14166.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [28, 19765.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 1024] + - [32, 17044.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [29, 16962.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [29, 20246.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [3, 19684.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [10, 17534.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [16, 18589.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 1856] + - [18, 16534.0] + - - [256, 6784, 1, 128, 256, 256, 128, 6784] + - [35, 15331.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 6784] + - [13, 18888.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 5056] + - [13, 18397.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 5888] + - [1, 17433.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [28, 19651.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [28, 17828.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [16, 19762.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [17, 17041.0] + - - [704, 5888, 1, 128, 704, 704, 128, 5888] + - [26, 15424.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 3584] + - [31, 19126.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [17, 20030.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 1408] + - [30, 14402.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [10, 18227.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [28, 16832.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [18, 13132.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [31, 15727.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [29, 17800.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [0, 16005.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [29, 20170.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [34, 19847.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [34, 17290.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [28, 19384.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [7, 15273.0] + - - [704, 4288, 1, 128, 704, 704, 128, 4288] + - [8, 14386.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 4288] + - [26, 17441.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 6784] + - [10, 17855.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [29, 16342.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [29, 16039.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [20, 16842.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 5056] + - [25, 18718.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [18, 13404.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [16, 19445.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [19, 15203.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [16, 17583.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 256] + - [0, 11953.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 5888] + - [7, 17793.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [30, 14612.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 1856] + - [6, 14585.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [28, 20346.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [28, 17137.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [28, 19812.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [13, 18473.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [28, 18178.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [28, 19740.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [28, 18466.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [16, 19401.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [16, 16711.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [16, 18832.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 1024] + - [30, 15325.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [16, 19876.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [28, 20593.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 5056] + - [25, 18882.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [16, 19173.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [30, 13772.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [30, 13962.0] + - - [704, 2368, 1, 128, 704, 704, 128, 2368] + - [12, 12793.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 256] + - [4, 12914.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 1856] + - [26, 16900.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 4288] + - [31, 18347.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [5, 15945.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [28, 17901.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [28, 18561.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [9, 14466.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 6784] + - [28, 19001.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [16, 20350.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [31, 16785.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [16, 20000.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [28, 18511.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 5888] + - [34, 19253.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [34, 19558.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [32, 14356.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [16, 17329.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [3, 19032.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [7, 19126.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [28, 19016.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [21, 15868.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [16, 20428.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [17, 19158.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [6, 16092.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [17, 16813.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [31, 15809.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [16, 18475.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [18, 13470.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [28, 19581.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [29, 20600.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [3, 20151.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [16, 18914.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [33, 16079.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [7, 17608.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 1408] + - [28, 17645.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [28, 19413.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [10, 19173.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [31, 17428.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [19, 17316.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 1024] + - [11, 16863.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 256] + - [35, 15205.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 704] + - [30, 15688.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [17, 18356.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [29, 18983.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [16, 18583.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [28, 19505.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [17, 15909.0] + - - [704, 6784, 1, 128, 704, 704, 128, 6784] + - [20, 15862.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [16, 19435.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [34, 20304.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [28, 19468.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 6784] + - [25, 18281.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [5, 15610.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [22, 19349.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [3, 16753.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 5056] + - [19, 17115.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [16, 20559.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [30, 14736.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 448] + - [6, 14808.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 2944] + - [10, 18547.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [32, 18009.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [3, 19467.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [28, 18200.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 4288] + - [28, 18211.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [29, 20162.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [30, 14057.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 1408] + - [31, 16971.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [16, 19016.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [16, 16764.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [28, 18562.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [10, 16093.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [16, 19073.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 2944] + - [12, 15276.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [3, 17646.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [17, 20188.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [17, 16998.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [28, 19706.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [2, 18085.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [31, 17850.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 1856] + - [10, 17664.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 704] + - [30, 14156.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [2, 15835.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [16, 20070.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 1408] + - [26, 15760.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 1024] + - [30, 13981.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [29, 20366.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [28, 18833.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [16, 18165.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 4288] + - [31, 18139.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [28, 19147.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [32, 16614.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [12, 15285.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 1856] + - [19, 17213.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [10, 18426.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [28, 19893.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [16, 15652.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [16, 20699.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [5, 16516.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [34, 16930.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 1024] + - [2, 16253.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [34, 18886.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [29, 19142.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [34, 20260.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [34, 20664.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [23, 17850.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [23, 20437.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [22, 20170.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [29, 20673.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [12, 15534.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [29, 19423.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [16, 17793.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [32, 19142.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [25, 19228.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [13, 19836.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [28, 19805.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [28, 20194.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [17, 20723.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [3, 20397.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [28, 20382.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [28, 20511.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [34, 20680.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [28, 20589.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [17, 20576.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [28, 20655.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [28, 20756.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [67, 14630.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [72, 13695.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 448] + - [69, 12146.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [41, 17035.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [59, 14428.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [72, 11896.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [37, 10379.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [51, 14043.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [70, 9184.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [61, 7376.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [68, 7844.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [47, 12150.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [49, 12188.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [49, 11844.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [65, 13722.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [45, 7537.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [63, 10266.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [69, 13745.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [69, 11302.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [78, 15257.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [45, 8307.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [62, 11736.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [72, 15338.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 256] + - [46, 10285.0] + - - [448, 2944, 1, 128, 448, 448, 128, 2944] + - [69, 13251.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [68, 8781.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [63, 8786.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [75, 10285.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [61, 9261.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 448] + - [38, 7935.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [72, 14322.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [45, 8903.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [46, 11785.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [71, 9342.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 256] + - [64, 10834.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [52, 13684.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [45, 8537.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [78, 12067.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [39, 10597.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [69, 12848.0] + - - [448, 1856, 1, 128, 448, 448, 128, 1856] + - [38, 11298.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [54, 13048.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [40, 12211.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [44, 13349.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [41, 15015.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [65, 14161.0] + - - [704, 1856, 1, 128, 704, 704, 128, 1856] + - [69, 12965.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [44, 12539.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [39, 10355.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [46, 8938.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [54, 11791.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [57, 12669.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [64, 13978.0] + - - [448, 2368, 1, 128, 448, 448, 128, 2368] + - [38, 12124.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [78, 12973.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [54, 14263.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [39, 8966.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [49, 13415.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [62, 9775.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [49, 14724.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [68, 8947.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [52, 12350.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [50, 14054.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [54, 11577.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [41, 15738.0] + - - [448, 1024, 1, 128, 448, 448, 128, 1024] + - [37, 7978.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [66, 14252.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 64] + - [45, 6595.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [62, 11578.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [57, 13401.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [75, 9654.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [49, 16523.0] + - - [256, 1856, 1, 128, 256, 256, 128, 1856] + - [71, 8196.0] + - - [448, 1408, 1, 128, 448, 448, 128, 1408] + - [39, 8991.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [65, 11700.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [53, 7660.0] + - - [704, 1408, 1, 128, 704, 704, 128, 1408] + - [38, 11726.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 448] + - [62, 12789.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [39, 7780.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [43, 13027.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [40, 11169.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [68, 9063.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [41, 16169.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [58, 14174.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [42, 11250.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [73, 13222.0] + - - [256, 2368, 1, 128, 256, 256, 128, 2368] + - [55, 8660.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [77, 12985.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [51, 11483.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [62, 9822.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [59, 14294.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [40, 10509.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [65, 13996.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [64, 13972.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [46, 8643.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [77, 11098.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [64, 13335.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 448] + - [46, 11251.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [74, 8527.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 256] + - [55, 8699.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [68, 9043.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [54, 10972.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [59, 14306.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [40, 9595.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [53, 8947.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [56, 10647.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [65, 16527.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [52, 14690.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 256] + - [45, 7442.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [53, 10225.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [37, 7490.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [67, 10938.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [55, 10594.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [59, 14604.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [56, 14236.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [64, 11651.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [51, 14676.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [45, 7684.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 704] + - [46, 10207.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [71, 12156.0] + - - [704, 1024, 1, 128, 704, 704, 128, 1024] + - [39, 10207.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [49, 10408.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 448] + - [70, 8951.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [71, 13821.0] + - - [704, 448, 1, 128, 704, 704, 128, 448] + - [37, 6706.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [49, 11972.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [70, 8999.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [60, 10646.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [75, 12808.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [44, 12463.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [49, 11487.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [51, 13531.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [78, 14805.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [75, 13918.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [62, 13069.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [37, 8697.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [64, 11464.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [61, 9109.0] + - - [64, 5888, 1, 128, 64, 64, 128, 5888] + - [37, 7755.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [54, 11722.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [53, 7719.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [76, 10572.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [40, 11947.0] + - - [704, 704, 1, 128, 704, 704, 128, 704] + - [37, 8504.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [42, 10419.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [53, 6339.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [70, 9103.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [78, 15584.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [72, 12378.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [71, 11239.0] + - - [256, 3584, 1, 128, 256, 256, 128, 3584] + - [40, 10996.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 256] + - [54, 12636.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [64, 9841.0] + - - [256, 2944, 1, 128, 256, 256, 128, 2944] + - [40, 10555.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [68, 10431.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [58, 12823.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [78, 16896.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 704] + - [54, 12845.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [48, 9172.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [64, 10182.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [51, 10518.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [46, 7893.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [64, 9923.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [51, 14358.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [42, 14108.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 256] + - [53, 8174.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [63, 9909.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [75, 9471.0] + - - [64, 6784, 1, 128, 64, 64, 128, 6784] + - [61, 7740.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [45, 9949.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [37, 7978.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [45, 8705.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [71, 14067.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [53, 10171.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 1024] + - [47, 12405.0] + - - [64, 5056, 1, 128, 64, 64, 128, 5056] + - [37, 6746.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 64] + - [45, 7286.0] + - - [448, 704, 1, 128, 448, 448, 128, 704] + - [37, 6728.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 704] + - [71, 11705.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [64, 11426.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [77, 13328.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [72, 14122.0] + - - [256, 1408, 1, 128, 256, 256, 128, 1408] + - [37, 7418.0] + - - [256, 4288, 1, 128, 256, 256, 128, 4288] + - [40, 12750.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [46, 10307.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [44, 12864.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [39, 10355.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 64] + - [74, 7697.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [49, 12587.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [37, 8641.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [65, 13829.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [52, 11655.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [58, 14453.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [46, 9674.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [63, 10263.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [37, 6381.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [54, 13809.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [59, 12963.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [57, 11900.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [44, 12879.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [79, 10368.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [41, 13000.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [37, 8722.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [75, 14290.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [65, 13154.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [75, 12916.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [49, 14096.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [68, 9063.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [51, 14414.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [78, 14619.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [49, 14500.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [116, 3525.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [96, 5110.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 64] + - [80, 2658.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [103, 4128.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [105, 4853.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [107, 7167.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [117, 5095.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [96, 4210.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [92, 7498.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [107, 6457.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [107, 6261.0] + - - [704, 256, 1, 128, 704, 704, 128, 256] + - [109, 4241.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [109, 4119.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [117, 7346.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [107, 6168.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [105, 5254.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [83, 6017.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [81, 5161.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [107, 6225.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [88, 5604.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [109, 5143.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [117, 3600.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [109, 4294.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [117, 6530.0] + - - [64, 2944, 1, 128, 64, 64, 128, 2944] + - [81, 4246.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [105, 6140.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [96, 5900.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [83, 6362.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [117, 4000.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [99, 4969.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [109, 4447.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [109, 4950.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 64] + - [107, 5611.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [114, 5166.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [109, 4951.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [114, 4190.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [109, 4047.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 256] + - [99, 5412.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 64] + - [116, 4102.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [107, 4033.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [88, 3305.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [92, 5066.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [117, 6388.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [99, 5457.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 64] + - [117, 4408.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [83, 7032.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [92, 6129.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [83, 6484.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [90, 5810.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [83, 7474.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [81, 4087.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [88, 4919.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 64] + - [109, 3464.0] + - - [64, 1408, 1, 128, 64, 64, 128, 1408] + - [80, 2773.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [109, 6129.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [110, 4942.0] + - - [448, 256, 1, 128, 448, 448, 128, 256] + - [107, 3164.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [109, 4518.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [80, 2708.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [117, 4973.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [109, 5942.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [103, 5104.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 64] + - [109, 3208.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [80, 2720.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [109, 6049.0] + - - [256, 448, 1, 128, 256, 256, 128, 448] + - [110, 3164.0] + - - [64, 3584, 1, 128, 64, 64, 128, 3584] + - [83, 4782.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [90, 4474.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [105, 5603.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [96, 6016.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [87, 3231.0] + - - [64, 1856, 1, 128, 64, 64, 128, 1856] + - [81, 3249.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [109, 4199.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [109, 5798.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [99, 4953.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [103, 4984.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [83, 5734.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [83, 7330.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [103, 4335.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [110, 5661.0] + - - [256, 704, 1, 128, 256, 256, 128, 704] + - [109, 4075.0] + - - [256, 1024, 1, 128, 256, 256, 128, 1024] + - [99, 5395.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [96, 4213.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [105, 6016.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [107, 7026.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [107, 4498.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [92, 6281.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [88, 4857.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [83, 7160.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [107, 5244.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [92, 4112.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [103, 3258.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [99, 6166.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [92, 5271.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [97, 5380.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [109, 6112.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [109, 4516.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [99, 5652.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [83, 6505.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [109, 5924.0] + - - [64, 2368, 1, 128, 64, 64, 128, 2368] + - [109, 3606.0] + - - [64, 4288, 1, 128, 64, 64, 128, 4288] + - [99, 5666.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [107, 5642.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [88, 3277.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [116, 5788.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [109, 5864.0] + - - [448, 448, 1, 128, 448, 448, 128, 448] + - [110, 4539.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [109, 4982.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [107, 6704.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [117, 5731.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 4] + - [121, 824.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 4] + - [121, 542.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 4] + - [119, 586.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 4] + - [122, 1004.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 4] + - [121, 514.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 4] + - [121, 421.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 4] + - [121, 501.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 4] + - [122, 734.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 4] + - [122, 842.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 4] + - [122, 685.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 4] + - [122, 452.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 4] + - [122, 288.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 4] + - [122, 981.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 4] + - [120, 365.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 4] + - [122, 718.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 4] + - [122, 1080.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 4] + - [119, 625.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 4] + - [122, 1037.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 4] + - [119, 710.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 4] + - [122, 348.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 4] + - [119, 278.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 4] + - [121, 400.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 4] + - [121, 784.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 4] + - [122, 581.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 4] + - [120, 480.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 4] + - [121, 628.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 4] + - [121, 569.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 4] + - [119, 1032.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 4] + - [119, 944.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 4] + - [122, 374.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 4] + - [104, 212.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 4] + - [119, 768.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 4] + - [122, 1005.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 4] + - [119, 704.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 4] + - [122, 826.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 4] + - [121, 1015.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 1856] + - [133, 741.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 2944] + - [126, 964.0] + - - [4, 1408, 1, 128, 4, 4, 128, 1408] + - [87, 215.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 2368] + - [126, 778.0] + - - [4, 3584, 1, 128, 4, 4, 128, 3584] + - [132, 501.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 5888] + - [129, 1492.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 1408] + - [130, 562.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 6784] + - [131, 1714.0] + - - [4, 4288, 1, 128, 4, 4, 128, 4288] + - [132, 597.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 5056] + - [132, 1374.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 6784] + - [127, 1614.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 2944] + - [133, 1059.0] + - - [4, 5056, 1, 256, 4, 4, 256, 5056] + - [124, 952.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 5056] + - [124, 1296.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 2368] + - [130, 858.0] + - - [4, 1856, 1, 256, 4, 4, 256, 1856] + - [125, 404.0] + - - [4, 2368, 1, 256, 4, 4, 256, 2368] + - [124, 474.0] + - - [4, 2944, 1, 256, 4, 4, 256, 2944] + - [129, 596.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 4288] + - [126, 1252.0] + - - [4, 6784, 1, 128, 4, 4, 128, 6784] + - [123, 886.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 3584] + - [133, 1058.0] + - - [4, 5888, 1, 256, 4, 4, 256, 5888] + - [132, 1061.0] + - - [4, 6784, 1, 256, 4, 4, 256, 6784] + - [128, 1106.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1408] + - [126, 494.0] + - - [4, 3584, 1, 256, 4, 4, 256, 3584] + - [124, 708.0] + - - [4, 1408, 1, 256, 4, 4, 256, 1408] + - [125, 311.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 4288] + - [126, 1371.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 5888] + - [129, 1418.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1856] + - [126, 649.0] + - - [4, 1856, 1, 128, 4, 4, 128, 1856] + - [87, 283.0] + - - [4, 2944, 1, 128, 4, 4, 128, 2944] + - [87, 423.0] + - - [4, 5056, 1, 128, 4, 4, 128, 5056] + - [132, 688.0] + - - [4, 4288, 1, 256, 4, 4, 256, 4288] + - [124, 832.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3584] + - [126, 1167.0] + - - [4, 5888, 1, 128, 4, 4, 128, 5888] + - [129, 802.0] + - - [4, 2368, 1, 128, 4, 4, 128, 2368] + - [87, 355.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 704] + - [86, 226.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [104, 477.0] + - - [64, 4, 1, 256, 64, 64, 256, 4] + - [93, 14.0] + - - [64, 704, 1, 128, 64, 64, 128, 704] + - [87, 1780.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [100, 2095.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 4] + - [86, 43.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [82, 3495.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [84, 2866.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 64] + - [106, 2305.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [82, 3622.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [88, 3654.0] + - - [4, 704, 1, 256, 4, 4, 256, 704] + - [93, 152.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 4] + - [93, 243.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [87, 1549.0] + - - [64, 1024, 1, 128, 64, 64, 128, 1024] + - [82, 2383.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 64] + - [112, 22.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [85, 2522.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [85, 2083.0] + - - [448, 4, 1, 256, 448, 448, 256, 4] + - [93, 100.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 4] + - [93, 153.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [87, 20.0] + - - [256, 4, 1, 128, 256, 256, 128, 4] + - [87, 41.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [100, 3001.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [87, 481.0] + - - [704, 64, 1, 128, 704, 704, 128, 64] + - [87, 1848.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 4] + - [93, 228.0] + - - [256, 256, 1, 128, 256, 256, 128, 256] + - [98, 2370.0] + - - [64, 256, 1, 128, 64, 64, 128, 256] + - [115, 718.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [102, 2945.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [82, 2594.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [85, 2378.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [93, 2238.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [113, 1336.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [112, 780.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [86, 1469.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [87, 1755.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [84, 2220.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [82, 3612.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 4] + - [94, 357.0] + - - [4, 4, 1, 256, 4, 4, 256, 4] + - [80, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [84, 958.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [85, 2370.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [86, 699.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 448] + - [86, 169.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [82, 2873.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 4] + - [101, 86.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [87, 2335.0] + - - [4, 704, 1, 128, 4, 4, 128, 704] + - [93, 108.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [87, 2558.0] + - - [448, 64, 1, 128, 448, 448, 128, 64] + - [87, 1184.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 448] + - [86, 151.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [82, 2844.0] + - - [256, 64, 1, 128, 256, 256, 128, 64] + - [87, 677.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 1024] + - [86, 367.0] + - - [704, 4, 1, 128, 704, 704, 128, 4] + - [113, 111.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [113, 56.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 4] + - [86, 96.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [93, 55.0] + - - [4, 4, 1, 128, 4, 4, 128, 4] + - [80, 1.0] + - - [4, 128, 1, 256, 4, 4, 256, 128] + - [93, 28.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [112, 351.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [108, 3321.0] + - - [4, 448, 1, 128, 4, 4, 128, 448] + - [104, 70.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [118, 1312.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 128] + - [94, 48.0] + - - [64, 4, 1, 128, 64, 64, 128, 4] + - [87, 10.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [93, 240.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 704] + - [94, 253.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 4] + - [80, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [84, 699.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 4] + - [89, 159.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [86, 390.0] + - - [4, 64, 1, 128, 4, 4, 128, 64] + - [87, 10.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [112, 703.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [93, 1336.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [87, 1770.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [93, 1372.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 4] + - [93, 342.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [87, 2316.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [108, 3211.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [86, 778.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [87, 1542.0] + - - [4, 256, 1, 128, 4, 4, 128, 256] + - [87, 39.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [106, 2787.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [104, 354.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 4] + - [80, 1.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1024] + - [94, 326.0] + - - [704, 4, 1, 256, 704, 704, 256, 4] + - [93, 155.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 4] + - [86, 48.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 4] + - [86, 168.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 4] + - [94, 253.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [108, 3189.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [91, 3564.0] + - - [4, 1024, 1, 128, 4, 4, 128, 1024] + - [93, 157.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [101, 1470.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [82, 2050.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [87, 1344.0] + - - [128, 4, 1, 256, 128, 128, 256, 4] + - [87, 28.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [82, 3489.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [85, 2520.0] + - - [448, 4, 1, 128, 448, 448, 128, 4] + - [87, 70.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 256] + - [101, 97.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [87, 20.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 256] + - [86, 86.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 4] + - [86, 24.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 64] + - [86, 24.0] + - - [4, 1024, 1, 256, 4, 4, 256, 1024] + - [93, 225.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [93, 936.0] + - - [4, 64, 1, 256, 4, 4, 256, 64] + - [93, 14.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [82, 2097.0] + - - [64, 448, 1, 128, 64, 64, 128, 448] + - [87, 1176.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [111, 3007.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [95, 3352.0] + - - [4, 448, 1, 256, 4, 4, 256, 448] + - [93, 98.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 128] + - [86, 43.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [87, 343.0] + - - [64, 64, 1, 128, 64, 64, 128, 64] + - [89, 176.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 4] + - [86, 21.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [112, 1467.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [104, 949.0] +- null diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB.yaml index 02e9a173c..6f6f9b7d7 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB.yaml @@ -1,7 +1,7 @@ -- {MinimumRequiredVersion: 4.33.0} +- {MinimumRequiredVersion: 4.26.0} - navi31 - gfx1100 -- [Device 6863] +- [Device 744c] - AllowNoFreeDims: false AssignedDerivedParameters: true Batched: true @@ -11,7 +11,6 @@ ConvolutionConfig: [] DataType: 0 DestDataType: 0 - Fp16AltImpl: false HighPrecisionAccumulate: false Index0: 0 Index01A: 0 @@ -54,6 +53,69698 @@ ZeroPadA: [] ZeroPadB: [] - - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_SN_SU0_SUM0_SUS0_TT4_8_WG32_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_4_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_4_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_SUS128_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_SUS0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_SUS0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_SUS0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_SUS128_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_SUS128_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_SUS0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_SUS0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_SUS128_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_SUS128_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 AggressivePerfMode: 1 AssertAlphaValue: false AssertBetaValue: false @@ -91,7 +69782,7 @@ DisableVgprOverlapping: false EdgeType: ShiftPtr EnableMatrixInstruction: false - ExpandPointerSwap: 0 + ExpandPointerSwap: true Fp16AltImpl: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 @@ -118,20 +69809,24 @@ KernelLanguage: Assembly LSCA: 8 LSCB: 8 - LSPA: 32 - LSPB: 32 + LSPA: 8 + LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 32 - LVPB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsBlockSizePerPad: 0 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsInitCVgprs: false LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -150,10 +69845,10 @@ LoopUnroll: 8 MACInstruction: FMA MIArchVgpr: false - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MagicDivAlg: 2 @@ -171,13 +69866,13 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 OptNoLoadLoop: 1 OptPreLoopVmcnt: 0 PackBatchDims: 0 @@ -195,7 +69890,7 @@ PersistentKernelAlongBatch: false PrefetchAcrossPersistent: 0 PrefetchAcrossPersistentMode: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AllowNoFreeDims: false @@ -253,12 +69948,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_ + SolutionIndex: 306 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_SUS128_TT2_2_WG8_8_1_WGM4 SourceSwap: false + SplitGlobalRead: 1 StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUMapping: 3 + StaggerUStride: 128 StoreCInUnroll: false StoreCInUnrollExact: false StoreCInUnrollInterval: 1 @@ -267,10 +69963,10 @@ StoreRemapVectorWidth: 0 StoreSyncOpt: 0 StoreVectorWidth: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -291,20 +69987,10820 @@ VectorWidth: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 - WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingType: B _DepthULds: 8 _GlobalAccumulation: null _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemC: 0 - _staggerStrideShift: 3 - allowLRVWforTLUandMI: false + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_SUS128_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + SplitGlobalRead: 1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + allowLRVWBforTLUandMI: false - [2, 3, 0, 1] -- - - [126, 126, 2, 66, 126, 126, 66, 66] - - [0, 0] -- null +- - - [1024, 4096, 1, 1024] + - [32, 77.958] + - - [4096, 4096, 1, 1024] + - [13, 87.004] + - - [1024, 4096, 1, 4096] + - [32, 80.855] + - - [30528, 4096, 1, 1024] + - [51, 90.275] + - - [1024, 2048, 1, 1024] + - [32, 75.26] + - - [4096, 2048, 1, 1024] + - [13, 82.772] + - - [1024, 2048, 1, 4096] + - [32, 79.298] + - - [30528, 2048, 1, 1024] + - [51, 89.603] + - - [30522, 320, 1, 768] + - [30, 72.183] + - - [3072, 4096, 1, 768] + - [15, 86.418] + - - [768, 4096, 1, 3072] + - [32, 79.862] + - - [768, 4096, 1, 768] + - [32, 77.182] + - - [30522, 160, 1, 768] + - [29, 59.6] + - - [30522, 640, 1, 768] + - [49, 86.147] + - - [30522, 1280, 1, 768] + - [43, 88.249] + - - [1024, 3072, 1, 1024] + - [32, 76.645] + - - [1024, 2048, 1, 3072] + - [32, 78.901] + - - [1024, 3072, 1, 3072] + - [15, 79.713] + - - [3072, 2048, 1, 1024] + - [30, 83.557] + - - [3072, 3072, 1, 1024] + - [32, 89.129] + - - [3072, 512, 1, 1024] + - [13, 70.22] + - - [30522, 160, 1, 1024] + - [12, 58.251] + - - [128, 128, 512, 64] + - [38, 62.366] + - - [512, 512, 64, 64] + - [38, 78.188] + - - [256, 256, 192, 64] + - [1, 74.606] + - - [256, 256, 96, 64] + - [5, 65.113] + - - [128, 128, 384, 64] + - [21, 60.353] + - - [128, 128, 96, 64] + - [19, 48.109] + - - [512, 512, 16, 64] + - [0, 65.113] + - - [512, 512, 96, 64] + - [38, 79.33] + - - [512, 512, 128, 64] + - [264, 67.527] + - - [2944, 4288, 1, 1280] + - [15, 87.658] + - - [2368, 5888, 1, 256] + - [30, 80.958] + - - [5888, 1856, 1, 256] + - [30, 79.285] + - - [512, 24000, 1, 1536] + - [49, 87.117] + - - [5888, 1408, 1, 256] + - [13, 78.603] + - - [5888, 1856, 1, 3328] + - [30, 82.578] + - - [5056, 704, 1, 256] + - [27, 65.885] + - - [5888, 2944, 1, 3328] + - [13, 87.753] + - - [1856, 4288, 1, 256] + - [13, 72.697] + - - [1024, 5056, 1, 128] + - [49, 71.326] + - - [5056, 5056, 1, 3328] + - [30, 86.571] + - - [1408, 5888, 1, 1280] + - [13, 82.587] + - - [2368, 6784, 1, 128] + - [22, 79.921] + - - [1024, 3584, 1, 3328] + - [49, 76.848] + - - [512, 48000, 1, 2048] + - [51, 89.883] + - - [5888, 1408, 1, 1280] + - [49, 82.456] + - - [1024, 2368, 1, 256] + - [29, 62.74] + - - [1408, 1856, 1, 1280] + - [49, 73.126] + - - [6144, 24000, 1, 2048] + - [13, 89.576] + - - [5056, 5056, 1, 1280] + - [13, 86.075] + - - [448, 5056, 1, 256] + - [10, 57.727] + - - [1760, 6400, 1, 1760] + - [37, 87.952] + - - [1856, 1408, 1, 128] + - [27, 63.146] + - - [6784, 256, 1, 3328] + - [14, 75.77] + - - [6784, 4288, 1, 3328] + - [7, 87.884] + - - [4288, 448, 1, 256] + - [27, 58.603] + - - [1856, 2368, 1, 3328] + - [13, 80.634] + - - [4288, 2944, 1, 1280] + - [30, 86.088] + - - [704, 5056, 1, 1280] + - [49, 73.329] + - - [2368, 704, 1, 3328] + - [29, 66.295] + - - [256, 5888, 1, 256] + - [10, 62.357] + - - [1856, 4288, 1, 3328] + - [30, 79.429] + - - [5888, 1024, 1, 256] + - [13, 77.119] + - - [16384, 3200, 1, 4096] + - [43, 79.727] + - - [1408, 2944, 1, 256] + - [47, 70.243] + - - [6784, 5056, 1, 3328] + - [32, 89.68] + - - [5056, 5056, 1, 256] + - [30, 83.616] + - - [1408, 6784, 1, 128] + - [22, 79.005] + - - [704, 5056, 1, 128] + - [10, 64.355] + - - [2368, 2944, 1, 1280] + - [49, 79.361] + - - [6784, 6784, 1, 1280] + - [41, 89.111] + - - [1408, 4288, 1, 1280] + - [49, 80.295] + - - [3584, 4288, 1, 1280] + - [30, 85.628] + - - [2368, 704, 1, 1280] + - [48, 64.987] + - - [5056, 4288, 1, 3328] + - [24, 88.457] + - - [3584, 2368, 1, 3328] + - [49, 84.577] + - - [6784, 448, 1, 1280] + - [30, 71.966] + - - [1408, 2944, 1, 128] + - [46, 69.711] + - - [4288, 2944, 1, 256] + - [13, 82.425] + - - [5888, 704, 1, 1280] + - [49, 75.025] + - - [448, 5888, 1, 128] + - [25, 58.86] + - - [5056, 2368, 1, 1280] + - [30, 81.919] + - - [448, 3584, 1, 1280] + - [15, 59.379] + - - [6784, 5888, 1, 256] + - [13, 86.512] + - - [1024, 1408, 1, 256] + - [12, 59.686] + - - [2368, 2368, 1, 3328] + - [30, 76.248] + - - [1856, 6784, 1, 128] + - [26, 77.751] + - - [5056, 704, 1, 3328] + - [30, 74.299] + - - [1408, 1856, 1, 256] + - [29, 66.868] + - - [2368, 5056, 1, 256] + - [30, 78.892] + - - [3584, 2368, 1, 1280] + - [49, 84.022] + - - [704, 5888, 1, 256] + - [15, 70.202] + - - [6784, 2944, 1, 128] + - [41, 84.956] + - - [2560, 1600, 1, 2560] + - [32, 78.093] + - - [4288, 6784, 1, 3328] + - [7, 87.753] + - - [2944, 6784, 1, 3328] + - [30, 89.039] + - - [6144, 5984, 1, 2048] + - [13, 88.335] + - - [3584, 704, 1, 3328] + - [13, 71.971] + - - [2048, 1600, 1, 512] + - [30, 73.433] + - - [448, 4288, 1, 256] + - [10, 60.687] + - - [1856, 4288, 1, 128] + - [46, 72.585] + - - [704, 2368, 1, 1280] + - [24, 60.886] + - - [1856, 2368, 1, 1280] + - [30, 79.46] + - - [1856, 4288, 1, 1280] + - [30, 78.883] + - - [704, 2944, 1, 128] + - [10, 61.066] + - - [1408, 1024, 1, 1280] + - [29, 66.931] + - - [704, 6784, 1, 256] + - [49, 71.845] + - - [6784, 704, 1, 256] + - [30, 71.75] + - - [5056, 1408, 1, 128] + - [9, 75.765] + - - [2048, 7000, 1, 2048] + - [13, 86.061] + - - [3584, 4288, 1, 3328] + - [49, 86.196] + - - [5888, 1856, 1, 1280] + - [49, 81.784] + - - [2368, 3584, 1, 1280] + - [49, 84.013] + - - [2368, 6784, 1, 1280] + - [30, 86.336] + - - [2944, 3584, 1, 3328] + - [13, 88.01] + - - [6784, 2944, 1, 256] + - [30, 85.804] + - - [4288, 2368, 1, 3328] + - [49, 84.658] + - - [1856, 2368, 1, 256] + - [13, 69.499] + - - [3584, 6784, 1, 3328] + - [24, 90.433] + - - [1024, 5888, 1, 3328] + - [30, 81.441] + - - [6144, 24000, 1, 2560] + - [15, 91.588] + - - [5056, 4288, 1, 1280] + - [24, 87.681] + - - [6784, 1856, 1, 3328] + - [49, 82.93] + - - [1408, 5056, 1, 1280] + - [49, 80.764] + - - [2368, 2368, 1, 1280] + - [49, 75.49] + - - [2944, 5888, 1, 128] + - [41, 83.517] + - - [704, 5888, 1, 1280] + - [32, 77.647] + - - [2368, 3584, 1, 128] + - [5, 75.707] + - - [1856, 5056, 1, 128] + - [46, 75.31] + - - [8192, 3200, 1, 2048] + - [13, 88.087] + - - [1024, 5056, 1, 1280] + - [13, 82.474] + - - [4288, 1024, 1, 256] + - [13, 73.649] + - - [2944, 2368, 1, 128] + - [41, 73.735] + - - [5888, 448, 1, 1280] + - [48, 64.509] + - - [704, 5888, 1, 3328] + - [32, 79.303] + - - [3584, 2944, 1, 256] + - [49, 83.99] + - - [512, 24000, 1, 2048] + - [49, 87.261] + - - [1408, 5056, 1, 3328] + - [49, 81.658] + - - [1856, 1856, 1, 3328] + - [30, 71.953] + - - [2560, 800, 1, 2560] + - [13, 71.985] + - - [2368, 2368, 1, 256] + - [30, 72.413] + - - [4288, 4288, 1, 1280] + - [30, 84.369] + - - [5888, 1024, 1, 1280] + - [30, 80.674] + - - [1408, 4288, 1, 256] + - [30, 76.212] + - - [5888, 448, 1, 128] + - [12, 60.164] + - - [512, 48000, 1, 2560] + - [32, 90.956] + - - [704, 6784, 1, 3328] + - [49, 78.206] + - - [2560, 6400, 1, 2560] + - [32, 90.573] + - - [5056, 1024, 1, 1280] + - [30, 83.047] + - - [448, 5888, 1, 3328] + - [15, 67.031] + - - [1024, 2944, 1, 1280] + - [32, 74.132] + - - [5056, 5888, 1, 1280] + - [7, 88.398] + - - [4288, 5888, 1, 128] + - [46, 82.127] + - - [1408, 3584, 1, 128] + - [26, 74.457] + - - [448, 3584, 1, 128] + - [8, 52.444] + - - [5888, 2944, 1, 1280] + - [49, 87.365] + - - [2368, 5888, 1, 128] + - [22, 78.68] + - - [3584, 5888, 1, 256] + - [13, 86.738] + - - [2368, 1024, 1, 128] + - [27, 58.788] + - - [2368, 704, 1, 128] + - [12, 54.763] + - - [3584, 2944, 1, 1280] + - [13, 87.564] + - - [3584, 2368, 1, 128] + - [41, 75.752] + - - [5056, 704, 1, 128] + - [45, 64.558] + - - [5056, 1408, 1, 3328] + - [13, 81.829] + - - [6784, 1024, 1, 3328] + - [15, 86.738] + - - [6784, 2944, 1, 3328] + - [30, 89.012] + - - [2944, 5056, 1, 3328] + - [32, 88.75] + - - [1856, 1856, 1, 256] + - [47, 64.928] + - - [1024, 5888, 1, 128] + - [9, 74.592] + - - [6784, 2368, 1, 1280] + - [13, 86.273] + - - [4288, 5888, 1, 1280] + - [24, 88.403] + - - [4288, 4288, 1, 256] + - [13, 81.401] + - - [4288, 1856, 1, 1280] + - [49, 79.018] + - - [1856, 2944, 1, 3328] + - [13, 80.967] + - - [256, 6784, 1, 3328] + - [14, 75.81] + - - [256, 5056, 1, 128] + - [8, 55.616] + - - [5056, 1024, 1, 256] + - [13, 77.678] + - - [5056, 1856, 1, 3328] + - [30, 82.781] + - - [1856, 1408, 1, 256] + - [47, 64.436] + - - [8448, 12000, 1, 2816] + - [7, 91.719] + - - [4288, 1408, 1, 128] + - [10, 72.07] + - - [1856, 5888, 1, 3328] + - [51, 85.01] + - - [4288, 5056, 1, 256] + - [15, 83.151] + - - [4096, 800, 1, 1024] + - [13, 66.886] + - - [5056, 256, 1, 3328] + - [12, 61.03] + - - [1024, 5888, 1, 1280] + - [30, 80.9] + - - [6784, 2368, 1, 128] + - [9, 79.659] + - - [1856, 1024, 1, 1280] + - [43, 69.255] + - - [6784, 4288, 1, 1280] + - [7, 87.234] + - - [1856, 1856, 1, 1280] + - [30, 70.239] + - - [4096, 400, 1, 1024] + - [32, 57.827] + - - [3072, 24000, 1, 1024] + - [30, 88.52] + - - [5888, 1856, 1, 128] + - [47, 74.177] + - - [5056, 3584, 1, 128] + - [46, 81.022] + - - [5888, 5888, 1, 3328] + - [24, 90.735] + - - [6784, 1024, 1, 256] + - [13, 80.3] + - - [2944, 2368, 1, 256] + - [30, 75.878] + - - [5056, 5888, 1, 3328] + - [24, 89.233] + - - [1856, 1024, 1, 256] + - [27, 58.287] + - - [512, 48000, 1, 1536] + - [32, 90.289] + - - [3584, 448, 1, 1280] + - [12, 62.993] + - - [8448, 5984, 1, 2816] + - [13, 88.935] + - - [448, 5888, 1, 256] + - [10, 58.977] + - - [1408, 6784, 1, 3328] + - [49, 84.437] + - - [4288, 704, 1, 128] + - [47, 61.82] + - - [5056, 2944, 1, 256] + - [30, 83.052] + - - [6784, 5888, 1, 128] + - [49, 83.909] + - - [2944, 704, 1, 128] + - [47, 59.843] + - - [1408, 3584, 1, 3328] + - [30, 82.853] + - - [2368, 6784, 1, 256] + - [49, 81.694] + - - [5056, 1408, 1, 1280] + - [13, 81.098] + - - [5056, 4288, 1, 128] + - [22, 81.789] + - - [1408, 1856, 1, 128] + - [12, 64.166] + - - [1408, 5888, 1, 3328] + - [49, 83.138] + - - [6784, 6784, 1, 256] + - [30, 86.878] + - - [4288, 2368, 1, 128] + - [27, 75.468] + - - [2368, 2944, 1, 256] + - [30, 74.624] + - - [3584, 1856, 1, 1280] + - [49, 81.694] + - - [6784, 6784, 1, 128] + - [13, 83.968] + - - [5888, 5056, 1, 256] + - [13, 84.757] + - - [8448, 48000, 1, 2816] + - [7, 92.193] + - - [3584, 448, 1, 256] + - [48, 57.737] + - - [448, 4288, 1, 128] + - [8, 55.747] + - - [256, 6784, 1, 256] + - [10, 70.433] + - - [1408, 4288, 1, 128] + - [27, 72.278] + - - [2944, 704, 1, 3328] + - [32, 77.606] + - - [5056, 256, 1, 1280] + - [48, 59.541] + - - [3584, 3584, 1, 256] + - [49, 85.452] + - - [3584, 5056, 1, 256] + - [30, 83.485] + - - [2944, 2368, 1, 1280] + - [49, 79.727] + - - [1408, 3584, 1, 256] + - [30, 76.934] + - - [6784, 3584, 1, 256] + - [24, 86.864] + - - [5056, 2368, 1, 128] + - [46, 77.322] + - - [2944, 2944, 1, 3328] + - [13, 86.431] + - - [5056, 6784, 1, 256] + - [30, 85.957] + - - [1856, 3584, 1, 128] + - [27, 72.616] + - - [6784, 448, 1, 256] + - [30, 66.056] + - - [3584, 6784, 1, 128] + - [41, 84.82] + - - [5056, 1856, 1, 256] + - [30, 79.253] + - - [4608, 5984, 1, 1536] + - [32, 88.263] + - - [1760, 3200, 1, 1760] + - [18, 86.183] + - - [1024, 1856, 1, 256] + - [12, 62.257] + - - [4096, 1600, 1, 1024] + - [13, 79.546] + - - [1408, 6784, 1, 1280] + - [30, 84.026] + - - [3584, 3584, 1, 1280] + - [32, 89.896] + - - [7680, 24000, 1, 2560] + - [32, 92.098] + - - [4608, 48000, 1, 1536] + - [32, 91.863] + - - [5888, 5888, 1, 128] + - [49, 84.843] + - - [5056, 2368, 1, 3328] + - [30, 82.136] + - - [2944, 4288, 1, 256] + - [30, 81.622] + - - [1408, 3584, 1, 1280] + - [49, 82.276] + - - [8192, 1600, 1, 2048] + - [13, 82.673] + - - [512, 24000, 1, 2560] + - [30, 87.83] + - - [2368, 6784, 1, 3328] + - [13, 86.738] + - - [1856, 1408, 1, 1280] + - [13, 72.057] + - - [6784, 704, 1, 128] + - [46, 67.401] + - - [1408, 5888, 1, 256] + - [49, 79.889] + - - [704, 2944, 1, 1280] + - [15, 74.854] + - - [704, 6784, 1, 128] + - [27, 68.574] + - - [3584, 704, 1, 1280] + - [13, 70.46] + - - [5888, 2368, 1, 256] + - [49, 81.333] + - - [2944, 6784, 1, 128] + - [22, 85.096] + - - [3584, 448, 1, 3328] + - [12, 64.238] + - - [704, 2368, 1, 3328] + - [15, 62.74] + - - [256, 5888, 1, 128] + - [27, 58.188] + - - [2048, 3200, 1, 512] + - [30, 79.871] + - - [2944, 2944, 1, 1280] + - [49, 86.088] + - - [5056, 448, 1, 3328] + - [12, 68.619] + - - [6784, 704, 1, 3328] + - [49, 78.188] + - - [5888, 4288, 1, 128] + - [37, 81.328] + - - [1408, 2944, 1, 3328] + - [32, 79.257] + - - [3584, 704, 1, 128] + - [12, 61.328] + - - [4608, 12000, 1, 1536] + - [32, 91.132] + - - [5056, 5056, 1, 128] + - [26, 81.707] + - - [8192, 800, 1, 2048] + - [13, 74.899] + - - [448, 5056, 1, 128] + - [8, 56.139] + - - [5056, 3584, 1, 256] + - [13, 83.219] + - - [1408, 5056, 1, 128] + - [46, 74.994] + - - [2944, 3584, 1, 128] + - [22, 82.411] + - - [3584, 2368, 1, 256] + - [30, 80.214] + - - [8448, 24000, 1, 2816] + - [7, 92.138] + - - [3584, 3584, 1, 3328] + - [51, 90.704] + - - [5888, 6784, 1, 256] + - [13, 86.449] + - - [4288, 2944, 1, 3328] + - [30, 86.476] + - - [256, 5056, 1, 1280] + - [48, 59.979] + - - [2944, 5888, 1, 3328] + - [24, 90.162] + - - [6784, 5888, 1, 1280] + - [32, 90.383] + - - [2048, 800, 1, 512] + - [12, 58.806] + - - [5888, 4288, 1, 1280] + - [24, 88.538] + - - [1024, 24000, 1, 2048] + - [15, 89.102] + - - [5888, 3584, 1, 128] + - [41, 82.871] + - - [1024, 2944, 1, 128] + - [8, 66.652] + - - [704, 3584, 1, 128] + - [47, 60.61] + - - [5888, 448, 1, 3328] + - [12, 65.032] + - - [2368, 4288, 1, 1280] + - [13, 83.819] + - - [4288, 2944, 1, 128] + - [13, 78.712] + - - [1024, 6784, 1, 3328] + - [15, 88.782] + - - [5056, 2944, 1, 3328] + - [13, 86.769] + - - [2944, 3584, 1, 256] + - [30, 83.494] + - - [1408, 1408, 1, 3328] + - [51, 74.651] + - - [3584, 3584, 1, 128] + - [41, 82.294] + - - [3584, 704, 1, 256] + - [29, 65.077] + - - [3584, 1408, 1, 3328] + - [30, 82.921] + - - [704, 3584, 1, 1280] + - [49, 70.802] + - - [2944, 6784, 1, 1280] + - [13, 88.592] + - - [1856, 6784, 1, 256] + - [30, 78.518] + - - [4288, 448, 1, 3328] + - [32, 71.407] + - - [6784, 4288, 1, 128] + - [46, 83.016] + - - [6784, 704, 1, 1280] + - [30, 77.114] + - - [3584, 6784, 1, 256] + - [30, 85.723] + - - [6144, 12000, 1, 2048] + - [13, 89.282] + - - [5888, 1024, 1, 3328] + - [30, 81.41] + - - [704, 6784, 1, 1280] + - [49, 76.943] + - - [1856, 5056, 1, 3328] + - [49, 82.894] + - - [1024, 3584, 1, 128] + - [8, 66.48] + - - [1024, 1408, 1, 128] + - [8, 61.044] + - - [2368, 2944, 1, 128] + - [27, 72.201] + - - [5056, 2944, 1, 128] + - [5, 80.593] + - - [5888, 5056, 1, 3328] + - [7, 89.738] + - - [5888, 2368, 1, 128] + - [46, 77.096] + - - [3584, 6784, 1, 1280] + - [7, 88.741] + - - [1856, 5888, 1, 256] + - [15, 77.236] + - - [4288, 4288, 1, 3328] + - [49, 84.856] + - - [4288, 1408, 1, 1280] + - [30, 80.196] + - - [3584, 5056, 1, 128] + - [41, 80.746] + - - [4288, 2368, 1, 256] + - [30, 80.44] + - - [2944, 5056, 1, 1280] + - [15, 87.771] + - - [448, 6784, 1, 256] + - [10, 65.835] + - - [1856, 2368, 1, 128] + - [47, 69.273] + - - [6784, 2368, 1, 3328] + - [13, 86.684] + - - [4288, 1856, 1, 3328] + - [49, 79.415] + - - [3584, 448, 1, 128] + - [25, 57.664] + - - [2048, 1600, 1, 2048] + - [13, 78.373] + - - [3584, 1024, 1, 1280] + - [30, 76.131] + - - [1856, 5056, 1, 256] + - [49, 78.278] + - - [1024, 4288, 1, 256] + - [13, 71.976] + - - [5888, 3584, 1, 3328] + - [51, 91.191] + - - [5056, 3584, 1, 3328] + - [43, 87.509] + - - [2368, 1408, 1, 1280] + - [13, 78.251] + - - [5056, 2944, 1, 1280] + - [13, 86.129] + - - [1024, 6784, 1, 256] + - [30, 81.676] + - - [5124, 9124, 1, 2048] + - [32, 87.081] + - - [2944, 1408, 1, 128] + - [47, 68.709] + - - [3584, 1408, 1, 1280] + - [30, 82.168] + - - [5056, 6784, 1, 3328] + - [7, 89.192] + - - [3584, 4288, 1, 256] + - [49, 82.407] + - - [1856, 6784, 1, 3328] + - [30, 82.962] + - - [5888, 4288, 1, 256] + - [13, 84.401] + - - [5056, 1408, 1, 256] + - [13, 76.614] + - - [3584, 1024, 1, 256] + - [49, 72.684] + - - [5888, 5888, 1, 256] + - [49, 86.666] + - - [4288, 1024, 1, 1280] + - [15, 79.145] + - - [448, 6784, 1, 3328] + - [51, 76.487] + - - [2944, 1408, 1, 1280] + - [51, 77.886] + - - [2944, 1856, 1, 3328] + - [13, 80.791] + - - [3584, 5888, 1, 1280] + - [32, 90.771] + - - [6784, 1856, 1, 1280] + - [13, 82.465] + - - [2944, 5056, 1, 256] + - [30, 83.449] + - - [5888, 256, 1, 3328] + - [49, 69.052] + - - [2944, 4288, 1, 128] + - [26, 79.19] + - - [3584, 1408, 1, 256] + - [49, 78.594] + - - [704, 3584, 1, 3328] + - [49, 72.314] + - - [4096, 3200, 1, 1024] + - [13, 85.497] + - - [5056, 448, 1, 1280] + - [12, 67.586] + - - [3584, 1856, 1, 3328] + - [49, 82.411] + - - [4288, 6784, 1, 1280] + - [7, 87.0] + - - [2560, 7000, 1, 2560] + - [30, 87.469] + - - [2944, 1024, 1, 256] + - [26, 65.375] + - - [2368, 4288, 1, 3328] + - [49, 84.667] + - - [1024, 1408, 1, 1280] + - [12, 66.503] + - - [6784, 5056, 1, 256] + - [13, 85.339] + - - [1856, 1856, 1, 128] + - [47, 61.436] + - - [3584, 5056, 1, 3328] + - [7, 87.518] + - - [448, 6784, 1, 128] + - [47, 60.944] + - - [2944, 6784, 1, 256] + - [30, 86.35] + - - [2944, 2944, 1, 128] + - [41, 80.367] + - - [1856, 3584, 1, 1280] + - [51, 84.062] + - - [4288, 448, 1, 128] + - [12, 56.834] + - - [4608, 24000, 1, 1536] + - [32, 91.629] + - - [1856, 1408, 1, 3328] + - [13, 74.543] + - - [1024, 4288, 1, 3328] + - [32, 83.796] + - - [5056, 448, 1, 256] + - [12, 63.313] + - - [2944, 2368, 1, 3328] + - [30, 80.088] + - - [704, 4288, 1, 3328] + - [15, 76.275] + - - [1024, 1856, 1, 1280] + - [43, 69.187] + - - [2048, 6400, 1, 2048] + - [13, 86.093] + - - [512, 48000, 1, 2816] + - [24, 91.168] + - - [5124, 9124, 1, 2560] + - [15, 87.428] + - - [1024, 5888, 1, 256] + - [13, 75.973] + - - [1408, 2368, 1, 256] + - [30, 70.017] + - - [1408, 1408, 1, 256] + - [29, 64.283] + - - [2368, 2368, 1, 128] + - [46, 70.306] + - - [6784, 1408, 1, 128] + - [41, 79.438] + - - [4288, 5888, 1, 256] + - [15, 85.263] + - - [1408, 5056, 1, 256] + - [49, 76.844] + - - [4288, 3584, 1, 128] + - [5, 80.015] + - - [3584, 5056, 1, 1280] + - [32, 86.688] + - - [1856, 1024, 1, 128] + - [27, 55.156] + - - [1024, 24000, 1, 1536] + - [15, 89.517] + - - [704, 4288, 1, 256] + - [47, 63.669] + - - [5888, 2368, 1, 1280] + - [13, 84.148] + - - [2368, 5888, 1, 1280] + - [30, 84.076] + - - [5888, 256, 1, 1280] + - [30, 67.965] + - - [2368, 1856, 1, 3328] + - [49, 80.782] + - - [2944, 704, 1, 256] + - [51, 65.479] + - - [2368, 1024, 1, 3328] + - [30, 69.738] + - - [704, 3584, 1, 256] + - [27, 64.504] + - - [704, 2944, 1, 3328] + - [32, 77.43] + - - [6784, 1024, 1, 128] + - [46, 79.474] + - - [2944, 1024, 1, 3328] + - [13, 73.153] + - - [2944, 5056, 1, 128] + - [22, 80.76] + - - [1408, 6784, 1, 256] + - [49, 81.734] + - - [6784, 1408, 1, 3328] + - [49, 84.401] + - - [4288, 6784, 1, 128] + - [46, 82.934] + - - [6784, 2944, 1, 1280] + - [30, 88.664] + - - [4288, 1856, 1, 128] + - [26, 71.479] + - - [1856, 2944, 1, 128] + - [27, 71.488] + - - [6784, 448, 1, 128] + - [21, 62.596] + - - [448, 5056, 1, 1280] + - [30, 63.827] + - - [2368, 1856, 1, 128] + - [47, 69.575] + - - [4288, 704, 1, 256] + - [30, 65.443] + - - [5888, 704, 1, 256] + - [30, 69.345] + - - [3584, 1024, 1, 128] + - [27, 68.903] + - - [256, 5888, 1, 3328] + - [49, 69.485] + - - [1408, 4288, 1, 3328] + - [49, 81.567] + - - [6784, 4288, 1, 256] + - [30, 84.649] + - - [5888, 256, 1, 256] + - [47, 61.432] + - - [6784, 1024, 1, 1280] + - [32, 85.885] + - - [5888, 1024, 1, 128] + - [49, 74.655] + - - [6784, 3584, 1, 1280] + - [32, 89.928] + - - [1024, 6784, 1, 1280] + - [32, 86.35] + - - [1408, 2944, 1, 1280] + - [32, 77.782] + - - [2048, 800, 1, 2048] + - [12, 64.17] + - - [1408, 2368, 1, 3328] + - [13, 80.57] + - - [2944, 1856, 1, 128] + - [27, 71.619] + - - [256, 6784, 1, 128] + - [27, 65.465] + - - [5056, 6784, 1, 128] + - [22, 83.291] + - - [4288, 5056, 1, 128] + - [22, 80.25] + - - [1856, 5888, 1, 128] + - [30, 75.797] + - - [2944, 5888, 1, 256] + - [15, 85.56] + - - [3584, 1856, 1, 256] + - [49, 77.372] + - - [4288, 3584, 1, 1280] + - [30, 85.89] + - - [704, 5888, 1, 128] + - [10, 67.053] + - - [6784, 3584, 1, 128] + - [46, 84.189] + - - [4288, 5056, 1, 3328] + - [7, 89.125] + - - [1408, 1408, 1, 128] + - [47, 59.257] + - - [5056, 2368, 1, 256] + - [13, 79.42] + - - [4288, 704, 1, 3328] + - [13, 72.946] + - - [448, 3584, 1, 256] + - [26, 53.32] + - - [2368, 1024, 1, 1280] + - [30, 67.938] + - - [2944, 1408, 1, 3328] + - [51, 79.366] + - - [1024, 1408, 1, 3328] + - [29, 67.929] + - - [2944, 5888, 1, 1280] + - [15, 89.391] + - - [5888, 3584, 1, 256] + - [13, 86.35] + - - [2368, 5056, 1, 128] + - [22, 75.829] + - - [1408, 1856, 1, 3328] + - [30, 74.881] + - - [6784, 1408, 1, 1280] + - [13, 84.004] + - - [4096, 7000, 1, 4096] + - [13, 86.711] + - - [704, 2944, 1, 256] + - [27, 64.418] + - - [6784, 5888, 1, 3328] + - [7, 90.889] + - - [2368, 4288, 1, 128] + - [27, 75.391] + - - [1024, 6784, 1, 128] + - [5, 77.958] + - - [1408, 1408, 1, 1280] + - [51, 72.932] + - - [16384, 400, 1, 4096] + - [32, 60.719] + - - [448, 4288, 1, 3328] + - [51, 71.574] + - - [2368, 1408, 1, 256] + - [30, 71.633] + - - [5888, 5056, 1, 128] + - [13, 81.969] + - - [704, 2368, 1, 256] + - [27, 53.784] + - - [1024, 24000, 1, 2560] + - [15, 90.803] + - - [5888, 2368, 1, 3328] + - [13, 84.527] + - - [5124, 9124, 1, 1760] + - [37, 89.973] + - - [4288, 448, 1, 1280] + - [32, 68.127] + - - [5888, 704, 1, 3328] + - [30, 76.032] + - - [5056, 256, 1, 128] + - [8, 51.267] + - - [1408, 5888, 1, 128] + - [22, 77.976] + - - [7680, 12000, 1, 2560] + - [15, 91.462] + - - [1408, 1024, 1, 256] + - [29, 59.221] + - - [8192, 400, 1, 2048] + - [12, 61.707] + - - [1024, 1856, 1, 128] + - [8, 62.366] + - - [5056, 6784, 1, 1280] + - [24, 88.592] + - - [704, 5056, 1, 3328] + - [13, 74.222] + - - [2368, 2944, 1, 3328] + - [49, 80.088] + - - [2368, 3584, 1, 256] + - [13, 78.558] + - - [5056, 3584, 1, 1280] + - [32, 86.891] + - - [5124, 9124, 1, 4096] + - [32, 87.099] + - - [7680, 48000, 1, 2560] + - [32, 92.143] + - - [1856, 2944, 1, 1280] + - [49, 79.654] + - - [1024, 48000, 1, 2816] + - [7, 91.633] + - - [2944, 1408, 1, 256] + - [30, 69.647] + - - [4288, 1408, 1, 3328] + - [30, 81.54] + - - [5888, 2944, 1, 128] + - [41, 81.892] + - - [2944, 1024, 1, 128] + - [27, 65.501] + - - [4288, 5056, 1, 1280] + - [15, 88.168] + - - [5888, 6784, 1, 1280] + - [49, 89.4] + - - [6784, 5056, 1, 128] + - [46, 82.673] + - - [1760, 1600, 1, 1760] + - [17, 76.483] + - - [5888, 1408, 1, 3328] + - [49, 83.183] + - - [2368, 1856, 1, 256] + - [30, 73.04] + - - [256, 5056, 1, 256] + - [29, 56.712] + - - [448, 3584, 1, 3328] + - [15, 60.755] + - - [704, 2368, 1, 128] + - [25, 55.778] + - - [5888, 256, 1, 128] + - [9, 55.873] + - - [3584, 1856, 1, 128] + - [41, 73.424] + - - [4288, 4288, 1, 128] + - [46, 80.48] + - - [1856, 1024, 1, 3328] + - [43, 71.533] + - - [1024, 5056, 1, 256] + - [49, 76.528] + - - [5888, 5888, 1, 1280] + - [7, 90.009] + - - [5056, 5888, 1, 128] + - [26, 83.674] + - - [2368, 1408, 1, 3328] + - [13, 80.385] + - - [1024, 48000, 1, 1536] + - [15, 90.356] + - - [5888, 448, 1, 256] + - [12, 60.448] + - - [2560, 3200, 1, 2560] + - [13, 87.279] + - - [5888, 6784, 1, 128] + - [49, 82.19] + - - [6144, 48000, 1, 2048] + - [13, 89.887] + - - [6784, 5056, 1, 1280] + - [7, 89.003] + - - [5056, 704, 1, 1280] + - [49, 72.603] + - - [1024, 48000, 1, 2560] + - [32, 91.367] + - - [1024, 2368, 1, 128] + - [8, 62.46] + - - [16384, 800, 1, 4096] + - [41, 70.617] + - - [5888, 5056, 1, 1280] + - [7, 88.845] + - - [3072, 48000, 1, 1024] + - [30, 89.377] + - - [6784, 1408, 1, 256] + - [30, 80.363] + - - [3584, 5888, 1, 128] + - [41, 84.843] + - - [5056, 5888, 1, 256] + - [13, 84.811] + - - [2368, 1024, 1, 256] + - [14, 60.412] + - - [2944, 1856, 1, 256] + - [30, 74.953] + - - [1856, 6784, 1, 1280] + - [30, 82.556] + - - [4288, 3584, 1, 256] + - [13, 82.759] + - - [6784, 448, 1, 3328] + - [13, 73.234] + - - [5056, 1856, 1, 1280] + - [30, 82.181] + - - [1408, 1024, 1, 3328] + - [29, 67.897] + - - [5888, 3584, 1, 1280] + - [51, 90.627] + - - [1856, 3584, 1, 3328] + - [15, 85.461] + - - [1024, 2944, 1, 256] + - [13, 68.059] + - - [448, 6784, 1, 1280] + - [32, 74.953] + - - [704, 5056, 1, 256] + - [49, 67.527] + - - [3584, 1024, 1, 3328] + - [30, 76.826] + - - [2944, 1856, 1, 1280] + - [49, 80.065] + - - [5056, 256, 1, 256] + - [12, 54.321] + - - [2944, 4288, 1, 3328] + - [51, 88.732] + - - [2368, 3584, 1, 3328] + - [49, 84.694] + - - [2944, 704, 1, 1280] + - [32, 75.391] + - - [2944, 3584, 1, 1280] + - [49, 87.658] + - - [1856, 5888, 1, 1280] + - [15, 84.144] + - - [2048, 3200, 1, 2048] + - [15, 83.48] + - - [4288, 1408, 1, 256] + - [13, 76.397] + - - [5888, 1408, 1, 128] + - [49, 77.85] + - - [4288, 2368, 1, 1280] + - [49, 84.135] + - - [6784, 2368, 1, 256] + - [13, 82.781] + - - [1024, 24000, 1, 2816] + - [7, 91.119] + - - [7680, 5984, 1, 2560] + - [32, 89.422] + - - [4288, 1856, 1, 256] + - [30, 74.155] + - - [1856, 2944, 1, 256] + - [30, 74.935] + - - [5056, 1024, 1, 128] + - [9, 74.822] + - - [1760, 800, 1, 1760] + - [18, 66.304] + - - [6784, 256, 1, 128] + - [27, 66.155] + - - [5888, 704, 1, 128] + - [47, 65.298] + - - [1408, 2368, 1, 128] + - [27, 68.917] + - - [1024, 4288, 1, 1280] + - [32, 81.432] + - - [2368, 5056, 1, 3328] + - [32, 84.468] + - - [4288, 1024, 1, 3328] + - [30, 80.606] + - - [6144, 48000, 1, 2560] + - [15, 91.886] + - - [1024, 5056, 1, 3328] + - [30, 84.595] + - - [1024, 1856, 1, 3328] + - [32, 71.353] + - - [4288, 6784, 1, 256] + - [13, 84.491] + - - [3584, 2944, 1, 3328] + - [13, 87.965] + - - [5888, 2944, 1, 256] + - [13, 84.432] + - - [448, 4288, 1, 1280] + - [51, 69.616] + - - [1024, 4288, 1, 128] + - [8, 70.437] + - - [5056, 4288, 1, 256] + - [30, 84.035] + - - [1024, 3584, 1, 256] + - [13, 72.26] + - - [6784, 6784, 1, 3328] + - [7, 89.734] + - - [448, 5888, 1, 1280] + - [15, 65.127] + - - [5056, 448, 1, 128] + - [25, 62.889] + - - [4288, 704, 1, 1280] + - [49, 71.66] + - - [3584, 2944, 1, 128] + - [49, 81.504] + - - [6784, 256, 1, 1280] + - [50, 74.949] + - - [2368, 5888, 1, 3328] + - [30, 84.396] + - - [2368, 1856, 1, 1280] + - [49, 78.915] + - - [448, 5056, 1, 3328] + - [49, 65.095] + - - [3584, 4288, 1, 128] + - [49, 79.411] + - - [5888, 4288, 1, 3328] + - [7, 89.63] + - - [2368, 704, 1, 256] + - [27, 51.731] + - - [3584, 1408, 1, 128] + - [41, 74.231] + - - [1856, 5056, 1, 1280] + - [49, 82.371] + - - [2944, 1024, 1, 1280] + - [49, 72.427] + - - [3584, 5888, 1, 3328] + - [24, 91.507] + - - [2368, 4288, 1, 256] + - [30, 78.996] + - - [1024, 2368, 1, 3328] + - [30, 69.693] + - - [1024, 3584, 1, 1280] + - [30, 76.189] + - - [4288, 5888, 1, 3328] + - [32, 89.116] + - - [1024, 2944, 1, 3328] + - [32, 75.995] + - - [6784, 1856, 1, 256] + - [13, 80.074] + - - [256, 6784, 1, 1280] + - [31, 74.89] + - - [1856, 3584, 1, 256] + - [24, 76.844] + - - [6784, 1856, 1, 128] + - [9, 77.85] + - - [512, 24000, 1, 2816] + - [49, 87.87] + - - [256, 5888, 1, 1280] + - [49, 67.004] + - - [16384, 1600, 1, 4096] + - [51, 77.751] + - - [2368, 1408, 1, 128] + - [47, 68.474] + - - [1408, 1024, 1, 128] + - [40, 57.583] + - - [6784, 3584, 1, 3328] + - [24, 90.618] + - - [1760, 7000, 1, 1760] + - [37, 87.591] + - - [2368, 5056, 1, 1280] + - [32, 83.841] + - - [1408, 2368, 1, 1280] + - [30, 78.797] + - - [704, 4288, 1, 128] + - [27, 61.996] + - - [2944, 2944, 1, 256] + - [30, 83.449] + - - [6784, 256, 1, 256] + - [47, 72.314] + - - [256, 5056, 1, 3328] + - [29, 61.292] + - - [5056, 1856, 1, 128] + - [26, 76.695] + - - [5056, 1024, 1, 3328] + - [30, 84.563] + - - [4288, 3584, 1, 3328] + - [49, 86.16] + - - [1024, 2368, 1, 1280] + - [49, 67.31] + - - [5888, 6784, 1, 3328] + - [13, 89.765] + - - [704, 4288, 1, 1280] + - [15, 73.347] + - - [1024, 48000, 1, 2048] + - [32, 90.307] + - - [4288, 1024, 1, 128] + - [27, 69.666] + - - [4096, 512, 1, 32] + - [0, 43.628] + - - [2048, 1024, 1, 1664] + - [24, 78.518] + - - [4096, 512, 1, 1408] + - [24, 77.714] + - - [4096, 1024, 1, 1280] + - [30, 76.781] + - - [2048, 1024, 1, 640] + - [43, 75.621] + - - [4096, 1024, 1, 13312] + - [49, 74.29] + - - [2048, 1024, 1, 13312] + - [13, 71.912] + - - [2048, 1024, 1, 3584] + - [15, 78.536] + - - [4096, 1024, 1, 1920] + - [46, 78.134] + - - [4096, 1024, 1, 12288] + - [32, 72.932] + - - [4096, 1024, 1, 8320] + - [24, 77.723] + - - [4096, 1024, 1, 15360] + - [49, 72.377] + - - [4096, 512, 1, 3072] + - [15, 78.238] + - - [4096, 512, 1, 13312] + - [13, 66.074] + - - [4096, 1024, 1, 3840] + - [49, 77.295] + - - [2048, 1024, 1, 3200] + - [24, 79.411] + - - [4096, 512, 1, 3840] + - [43, 79.285] + - - [4096, 512, 1, 5632] + - [32, 79.501] + - - [4096, 512, 1, 64] + - [0, 57.127] + - - [2048, 1024, 1, 512] + - [32, 72.846] + - - [4096, 512, 1, 8192] + - [15, 73.997] + - - [4096, 512, 1, 2304] + - [15, 77.723] + - - [4096, 512, 1, 2816] + - [15, 78.757] + - - [2048, 1024, 1, 7680] + - [15, 79.596] + - - [4096, 512, 1, 1920] + - [24, 78.432] + - - [4096, 1024, 1, 32] + - [8, 54.163] + - - [4096, 512, 1, 16640] + - [51, 79.812] + - - [2048, 1024, 1, 1024] + - [51, 75.355] + - - [4096, 512, 1, 1792] + - [51, 77.778] + - - [4096, 1024, 1, 8192] + - [32, 70.405] + - - [2048, 1024, 1, 4160] + - [43, 79.375] + - - [4096, 512, 1, 10240] + - [51, 71.154] + - - [4096, 512, 1, 512] + - [28, 70.825] + - - [2048, 1024, 1, 6656] + - [15, 79.406] + - - [2048, 1024, 1, 14336] + - [13, 71.741] + - - [4096, 512, 1, 11264] + - [13, 68.565] + - - [4096, 512, 1, 128] + - [27, 60.859] + - - [4096, 512, 1, 768] + - [15, 75.332] + - - [4096, 1024, 1, 11264] + - [49, 72.413] + - - [4096, 1024, 1, 16640] + - [49, 77.683] + - - [2048, 1024, 1, 5632] + - [15, 79.122] + - - [4096, 512, 1, 12288] + - [14, 64.477] + - - [4096, 1024, 1, 5632] + - [30, 77.466] + - - [2048, 1024, 1, 10240] + - [15, 79.406] + - - [4096, 1024, 1, 640] + - [46, 76.208] + - - [2048, 1024, 1, 12288] + - [15, 76.411] + - - [4096, 1024, 1, 10240] + - [51, 73.379] + - - [2048, 1024, 1, 4608] + - [15, 78.869] + - - [4096, 512, 1, 3584] + - [15, 78.842] + - - [4096, 1024, 1, 4608] + - [30, 77.444] + - - [4096, 1024, 1, 3328] + - [30, 77.448] + - - [2048, 1024, 1, 9216] + - [15, 79.682] + - - [2048, 1024, 1, 2304] + - [32, 78.54] + - - [4096, 512, 1, 6144] + - [15, 79.289] + - - [4096, 512, 1, 15360] + - [30, 66.683] + - - [4096, 1024, 1, 7168] + - [15, 77.538] + - - [4096, 1024, 1, 9216] + - [32, 72.81] + - - [4096, 1024, 1, 7680] + - [32, 77.39] + - - [2048, 1024, 1, 8192] + - [15, 78.712] + - - [4096, 1024, 1, 64] + - [0, 66.11] + - - [2048, 1024, 1, 1280] + - [24, 77.597] + - - [2048, 1024, 1, 3328] + - [51, 79.154] + - - [4096, 512, 1, 14336] + - [30, 68.05] + - - [4096, 512, 1, 8320] + - [24, 79.668] + - - [4096, 1024, 1, 6656] + - [30, 77.538] + - - [2048, 1024, 1, 256] + - [11, 64.87] + - - [4096, 512, 1, 1024] + - [15, 76.302] + - - [4096, 1024, 1, 1536] + - [30, 76.812] + - - [2048, 1024, 1, 32] + - [0, 40.696] + - - [4096, 512, 1, 640] + - [15, 74.89] + - - [4096, 512, 1, 16384] + - [51, 68.659] + - - [4096, 1024, 1, 512] + - [30, 74.186] + - - [2048, 1024, 1, 1152] + - [43, 77.936] + - - [4096, 1024, 1, 2080] + - [37, 80.246] + - - [4096, 1024, 1, 768] + - [30, 76.162] + - - [4096, 1024, 1, 2560] + - [30, 77.029] + - - [2048, 1024, 1, 64] + - [8, 58.562] + - - [4096, 1024, 1, 16384] + - [51, 69.219] + - - [4096, 512, 1, 6656] + - [32, 79.393] + - - [2048, 1024, 1, 128] + - [10, 60.556] + - - [2048, 1024, 1, 2080] + - [43, 78.887] + - - [2048, 1024, 1, 16640] + - [15, 79.551] + - - [2048, 1024, 1, 3072] + - [15, 78.202] + - - [4096, 1024, 1, 1408] + - [26, 77.976] + - - [4096, 1024, 1, 2048] + - [13, 76.817] + - - [2048, 1024, 1, 2560] + - [51, 78.567] + - - [4096, 1024, 1, 128] + - [9, 69.129] + - - [4096, 1024, 1, 14336] + - [51, 74.286] + - - [4096, 512, 1, 9216] + - [13, 72.192] + - - [2048, 1024, 1, 2048] + - [51, 77.507] + - - [4096, 512, 1, 1536] + - [51, 77.403] + - - [2048, 1024, 1, 16384] + - [32, 71.105] + - - [4096, 1024, 1, 1024] + - [13, 75.287] + - - [4096, 1024, 1, 1664] + - [26, 78.454] + - - [4096, 512, 1, 384] + - [7, 71.691] + - - [4096, 512, 1, 3328] + - [15, 78.901] + - - [4096, 1024, 1, 256] + - [13, 73.203] + - - [2048, 1024, 1, 7168] + - [15, 79.618] + - - [2048, 1024, 1, 1536] + - [32, 77.629] + - - [4096, 512, 1, 7168] + - [15, 79.298] + - - [4096, 1024, 1, 896] + - [26, 76.866] + - - [4096, 1024, 1, 4096] + - [15, 77.11] + - - [2048, 1024, 1, 6144] + - [15, 79.515] + - - [4096, 512, 1, 4160] + - [7, 79.519] + - - [4096, 512, 1, 2080] + - [24, 78.68] + - - [4096, 1024, 1, 5120] + - [15, 77.358] + - - [2048, 1024, 1, 1920] + - [43, 78.554] + - - [2048, 1024, 1, 15360] + - [13, 69.408] + - - [4096, 1024, 1, 2816] + - [13, 77.06] + - - [4096, 512, 1, 256] + - [30, 67.545] + - - [2048, 1024, 1, 5120] + - [15, 79.312] + - - [2048, 1024, 1, 4096] + - [15, 79.145] + - - [4096, 512, 1, 4608] + - [51, 79.262] + - - [4096, 512, 1, 1664] + - [24, 78.143] + - - [2048, 1024, 1, 896] + - [24, 77.029] + - - [4096, 1024, 1, 4160] + - [37, 80.412] + - - [2048, 1024, 1, 11264] + - [51, 79.406] + - - [2048, 1024, 1, 384] + - [9, 69.819] + - - [2048, 1024, 1, 3840] + - [43, 79.447] + - - [4096, 512, 1, 1280] + - [43, 77.119] + - - [4096, 1024, 1, 1152] + - [26, 77.917] + - - [2048, 1024, 1, 1408] + - [24, 78.22] + - - [4096, 512, 1, 896] + - [43, 76.641] + - - [4096, 1024, 1, 3072] + - [13, 77.29] + - - [2048, 1024, 1, 2816] + - [51, 78.901] + - - [4096, 1024, 1, 1792] + - [13, 77.011] + - - [4096, 512, 1, 1152] + - [24, 77.227] + - - [4096, 512, 1, 7680] + - [51, 79.474] + - - [4096, 1024, 1, 384] + - [26, 74.804] + - - [2048, 1024, 1, 1792] + - [51, 78.143] + - - [4096, 1024, 1, 3584] + - [13, 77.43] + - - [2048, 1024, 1, 768] + - [32, 76.09] + - - [2048, 1024, 1, 8320] + - [24, 79.921] + - - [4096, 512, 1, 2048] + - [15, 78.035] + - - [4096, 512, 1, 2560] + - [51, 78.504] + - - [4096, 1024, 1, 2304] + - [43, 77.295] + - - [4096, 512, 1, 5120] + - [15, 79.294] + - - [4096, 1024, 1, 6144] + - [13, 77.254] + - - [1024, 3392, 1, 4096] + - [31, 76.135] + - - [1024, 3301, 1, 4096] + - [32, 85.303] + - - [1024, 3443, 1, 4096] + - [31, 75.107] + - - [132, 134, 480, 64] + - [36, 28.284] + - - [162, 162, 400, 64] + - [0, 39.532] + - - [4096, 3548, 1, 1024] + - [13, 86.323] + - - [4096, 2977, 1, 1024] + - [15, 83.485] + - - [132, 135, 480, 64] + - [36, 28.469] + - - [1024, 2985, 1, 4096] + - [15, 77.448] + - - [33708, 3681, 1, 1024] + - [49, 88.308] + - - [4096, 3443, 1, 1024] + - [13, 87.13] + - - [1024, 3400, 1, 4096] + - [13, 78.734] + - - [4096, 3995, 1, 1024] + - [13, 84.726] + - - [4096, 3190, 1, 1024] + - [13, 84.987] + - - [4096, 3594, 1, 1024] + - [13, 84.586] + - - [159, 162, 400, 64] + - [0, 39.216] + - - [1024, 3565, 1, 4096] + - [30, 70.288] + - - [4096, 3422, 1, 1024] + - [13, 86.291] + - - [1024, 3214, 1, 4096] + - [51, 83.11] + - - [33708, 3584, 1, 1024] + - [51, 89.562] + - - [33708, 3640, 1, 1024] + - [49, 87.365] + - - [4096, 3263, 1, 1024] + - [15, 83.905] + - - [4096, 3296, 1, 1024] + - [15, 85.123] + - - [1024, 3557, 1, 4096] + - [13, 75.373] + - - [4096, 3463, 1, 1024] + - [13, 84.265] + - - [4096, 3528, 1, 1024] + - [13, 86.057] + - - [4096, 3226, 1, 1024] + - [15, 83.205] + - - [4096, 3439, 1, 1024] + - [13, 87.216] + - - [1024, 3523, 1, 4096] + - [13, 72.878] + - - [1024, 3098, 1, 4096] + - [51, 80.399] + - - [4096, 3121, 1, 1024] + - [13, 82.772] + - - [33708, 3894, 1, 1024] + - [49, 87.225] + - - [1024, 3548, 1, 4096] + - [13, 72.436] + - - [1024, 3451, 1, 4096] + - [30, 73.121] + - - [4096, 3353, 1, 1024] + - [13, 84.441] + - - [4096, 3402, 1, 1024] + - [13, 86.323] + - - [4096, 3939, 1, 1024] + - [13, 86.264] + - - [133, 133, 480, 64] + - [17, 28.239] + - - [1024, 3559, 1, 4096] + - [13, 75.856] + - - [1024, 2977, 1, 4096] + - [32, 77.335] + - - [1024, 3478, 1, 4096] + - [30, 74.516] + - - [134, 134, 480, 64] + - [17, 28.758] + - - [1024, 3368, 1, 4096] + - [31, 75.662] + - - [4096, 4012, 1, 1024] + - [13, 85.118] + - - [4096, 3486, 1, 1024] + - [13, 85.154] + - - [1024, 3479, 1, 4096] + - [30, 71.7] + - - [1024, 3505, 1, 4096] + - [13, 73.559] + - - [4096, 3381, 1, 1024] + - [13, 85.515] + - - [4096, 3430, 1, 1024] + - [13, 87.013] + - - [1024, 3554, 1, 4096] + - [30, 70.013] + - - [4096, 3271, 1, 1024] + - [15, 84.356] + - - [1024, 3063, 1, 4096] + - [15, 79.384] + - - [1024, 3209, 1, 4096] + - [32, 83.165] + - - [4096, 3503, 1, 1024] + - [13, 85.303] + - - [4096, 3344, 1, 1024] + - [13, 84.55] + - - [1024, 3147, 1, 4096] + - [15, 81.536] + - - [1024, 3322, 1, 4096] + - [32, 85.822] + - - [1024, 3341, 1, 4096] + - [31, 74.43] + - - [1024, 3516, 1, 4096] + - [13, 67.414] + - - [1024, 3454, 1, 4096] + - [31, 77.277] + - - [4096, 3969, 1, 1024] + - [13, 84.238] + - - [4096, 3466, 1, 1024] + - [13, 84.721] + - - [1024, 3999, 1, 1024] + - [32, 75.955] + - - [1024, 4032, 1, 1024] + - [32, 76.704] + - - [1024, 3403, 1, 4096] + - [30, 81.062] + - - [4096, 3361, 1, 1024] + - [13, 85.393] + - - [1024, 3527, 1, 4096] + - [13, 75.332] + - - [1024, 3822, 1, 4096] + - [13, 78.075] + - - [4096, 3315, 1, 1024] + - [15, 85.511] + - - [232, 232, 272, 64] + - [42, 47.585] + - - [1024, 3336, 1, 4096] + - [13, 77.665] + - - [228, 232, 272, 64] + - [6, 45.207] + - - [4096, 3547, 1, 1024] + - [13, 86.562] + - - [4096, 3340, 1, 1024] + - [13, 84.92] + - - [1024, 3906, 1, 1024] + - [30, 81.234] + - - [1024, 3295, 1, 4096] + - [15, 85.109] + - - [4096, 3294, 1, 1024] + - [15, 84.807] + - - [33708, 3968, 1, 1024] + - [49, 89.021] + - - [1024, 3473, 1, 4096] + - [13, 72.151] + - - [1024, 3072, 1, 4096] + - [15, 80.052] + - - [4096, 3189, 1, 1024] + - [13, 84.771] + - - [4096, 3494, 1, 1024] + - [13, 85.1] + - - [1024, 3522, 1, 4096] + - [30, 75.436] + - - [33708, 3944, 1, 1024] + - [49, 88.371] + - - [135, 135, 480, 64] + - [45, 28.848] + - - [4096, 3421, 1, 1024] + - [13, 86.914] + - - [4096, 3311, 1, 1024] + - [15, 85.461] + - - [1024, 3990, 1, 1024] + - [32, 75.856] + - - [1024, 3290, 1, 4096] + - [51, 84.978] + - - [4096, 3565, 1, 1024] + - [13, 86.724] + - - [1024, 3484, 1, 4096] + - [13, 72.697] + - - [4096, 3384, 1, 1024] + - [13, 85.673] + - - [1024, 3422, 1, 4096] + - [13, 77.94] + - - [4096, 3681, 1, 1024] + - [13, 86.418] + - - [1024, 3584, 1, 1024] + - [30, 75.756] + - - [4096, 4050, 1, 1024] + - [13, 85.741] + - - [1024, 3996, 1, 4096] + - [32, 78.378] + - - [4096, 3169, 1, 1024] + - [13, 84.202] + - - [4096, 3538, 1, 1024] + - [13, 86.323] + - - [1024, 3495, 1, 4096] + - [13, 68.975] + - - [4096, 3401, 1, 1024] + - [13, 86.106] + - - [1024, 3560, 1, 4096] + - [13, 76.162] + - - [133, 135, 480, 64] + - [17, 28.564] + - - [1024, 3263, 1, 4096] + - [51, 84.45] + - - [1024, 3870, 1, 4096] + - [13, 79.749] + - - [4096, 3555, 1, 1024] + - [13, 86.756] + - - [4096, 3412, 1, 1024] + - [13, 86.729] + - - [1024, 3296, 1, 4096] + - [51, 85.168] + - - [1024, 3379, 1, 4096] + - [31, 75.12] + - - [4096, 3302, 1, 1024] + - [15, 85.032] + - - [1024, 3490, 1, 4096] + - [13, 72.03] + - - [1024, 3428, 1, 4096] + - [13, 76.469] + - - [1024, 3976, 1, 4096] + - [32, 78.202] + - - [4096, 3485, 1, 1024] + - [13, 84.902] + - - [4096, 3534, 1, 1024] + - [13, 86.246] + - - [1024, 3064, 1, 4096] + - [32, 79.366] + - - [4096, 3216, 1, 1024] + - [15, 83.029] + - - [1024, 3450, 1, 4096] + - [14, 76.478] + - - [1024, 3533, 1, 4096] + - [13, 71.213] + - - [1024, 4030, 1, 1024] + - [32, 76.591] + - - [1024, 3311, 1, 4096] + - [15, 85.565] + - - [1024, 3468, 1, 4096] + - [30, 69.909] + - - [4096, 3359, 1, 1024] + - [13, 85.127] + - - [4096, 3392, 1, 1024] + - [13, 86.102] + - - [1024, 3925, 1, 1024] + - [30, 81.734] + - - [4096, 3233, 1, 1024] + - [15, 83.354] + - - [4096, 3956, 1, 1024] + - [13, 86.679] + - - [1024, 3463, 1, 4096] + - [30, 74.34] + - - [1024, 3126, 1, 4096] + - [32, 81.143] + - - [1024, 3363, 1, 4096] + - [30, 81.337] + - - [4096, 3465, 1, 1024] + - [13, 84.432] + - - [33708, 3996, 1, 1024] + - [51, 88.024] + - - [1024, 3231, 1, 4096] + - [51, 83.196] + - - [33708, 3978, 1, 1024] + - [51, 87.275] + - - [4096, 3476, 1, 1024] + - [13, 84.563] + - - [4096, 3339, 1, 1024] + - [13, 84.563] + - - [4096, 3452, 1, 1024] + - [13, 87.532] + - - [1024, 3396, 1, 4096] + - [13, 81.707] + - - [4096, 3293, 1, 1024] + - [15, 84.581] + - - [1024, 3432, 1, 4096] + - [13, 79.316] + - - [4096, 3493, 1, 1024] + - [13, 85.254] + - - [4096, 3350, 1, 1024] + - [13, 85.105] + - - [1024, 3079, 1, 4096] + - [51, 79.867] + - - [1024, 3101, 1, 4096] + - [32, 80.516] + - - [33708, 3939, 1, 1024] + - [49, 88.213] + - - [4096, 3256, 1, 1024] + - [15, 83.643] + - - [1024, 3439, 1, 4096] + - [13, 76.474] + - - [1024, 3510, 1, 4096] + - [13, 71.123] + - - [4096, 3900, 1, 1024] + - [13, 85.217] + - - [1024, 3470, 1, 4096] + - [30, 69.228] + - - [4096, 3456, 1, 1024] + - [13, 87.478] + - - [4096, 3014, 1, 1024] + - [15, 84.455] + - - [4096, 3367, 1, 1024] + - [13, 85.393] + - - [4096, 3432, 1, 1024] + - [13, 86.837] + - - [33708, 4026, 1, 1024] + - [51, 88.231] + - - [4096, 3273, 1, 1024] + - [15, 83.963] + - - [4096, 3130, 1, 1024] + - [13, 83.282] + - - [1024, 3496, 1, 4096] + - [13, 73.713] + - - [1024, 3995, 1, 4096] + - [32, 78.563] + - - [1024, 3939, 1, 4096] + - [13, 78.585] + - - [1024, 3121, 1, 4096] + - [32, 80.927] + - - [1024, 3232, 1, 4096] + - [15, 83.688] + - - [4096, 3147, 1, 1024] + - [13, 83.76] + - - [4096, 3516, 1, 1024] + - [13, 85.745] + - - [1024, 3969, 1, 1024] + - [32, 75.174] + - - [1024, 3364, 1, 4096] + - [31, 74.705] + - - [4096, 3411, 1, 1024] + - [13, 86.35] + - - [147, 147, 432, 64] + - [0, 33.888] + - - [4096, 3301, 1, 1024] + - [15, 84.897] + - - [1024, 3513, 1, 4096] + - [30, 71.303] + - - [1024, 3469, 1, 4096] + - [13, 71.574] + - - [1024, 3095, 1, 4096] + - [15, 80.268] + - - [4096, 3533, 1, 1024] + - [13, 85.633] + - - [4096, 3390, 1, 1024] + - [13, 85.881] + - - [4096, 3582, 1, 1024] + - [13, 87.622] + - - [1024, 3956, 1, 1024] + - [30, 82.267] + - - [4096, 3585, 1, 1024] + - [13, 84.45] + - - [4096, 3231, 1, 1024] + - [15, 83.201] + - - [1024, 3205, 1, 4096] + - [15, 82.957] + - - [4096, 3496, 1, 1024] + - [13, 85.163] + - - [1024, 3143, 1, 4096] + - [15, 81.464] + - - [1024, 3318, 1, 4096] + - [32, 85.786] + - - [1024, 3353, 1, 4096] + - [13, 79.88] + - - [1024, 3464, 1, 4096] + - [30, 72.102] + - - [4096, 2736, 1, 1024] + - [15, 84.216] + - - [1024, 3402, 1, 4096] + - [13, 81.888] + - - [4096, 3138, 1, 1024] + - [13, 83.738] + - - [1024, 3860, 1, 4096] + - [32, 75.914] + - - [148, 148, 432, 64] + - [0, 34.258] + - - [1024, 3539, 1, 4096] + - [13, 73.049] + - - [4096, 3211, 1, 1024] + - [15, 82.759] + - - [1024, 3332, 1, 4096] + - [13, 76.663] + - - [1024, 3466, 1, 4096] + - [13, 71.682] + - - [4096, 3475, 1, 1024] + - [13, 84.518] + - - [4096, 3524, 1, 1024] + - [13, 85.763] + - - [4096, 2985, 1, 1024] + - [15, 83.512] + - - [4096, 3222, 1, 1024] + - [15, 83.088] + - - [4096, 3451, 1, 1024] + - [13, 87.324] + - - [1024, 3181, 1, 4096] + - [32, 82.325] + - - [1024, 3640, 1, 4096] + - [13, 75.702] + - - [1024, 3375, 1, 4096] + - [30, 82.916] + - - [1024, 3550, 1, 4096] + - [13, 75.95] + - - [1024, 4020, 1, 1024] + - [15, 76.126] + - - [4096, 3349, 1, 1024] + - [13, 84.947] + - - [4096, 3398, 1, 1024] + - [13, 86.345] + - - [33708, 3976, 1, 1024] + - [51, 87.279] + - - [1024, 2917, 1, 4096] + - [32, 75.553] + - - [33708, 3910, 1, 1024] + - [49, 87.667] + - - [4096, 3860, 1, 1024] + - [13, 84.238] + - - [4096, 3304, 1, 1024] + - [15, 84.987] + - - [1024, 3286, 1, 4096] + - [51, 85.154] + - - [1024, 3460, 1, 4096] + - [13, 73.762] + - - [1024, 4026, 1, 4096] + - [32, 79.235] + - - [4096, 3471, 1, 1024] + - [13, 84.572] + - - [193, 193, 320, 64] + - [37, 38.846] + - - [1024, 3894, 1, 1024] + - [30, 80.999] + - - [1024, 3506, 1, 4096] + - [13, 74.886] + - - [1024, 4000, 1, 1024] + - [32, 75.955] + - - [1024, 3900, 1, 4096] + - [13, 81.27] + - - [1024, 3445, 1, 4096] + - [13, 83.368] + - - [4096, 3442, 1, 1024] + - [13, 87.221] + - - [1024, 3358, 1, 4096] + - [30, 73.32] + - - [1024, 3211, 1, 4096] + - [32, 83.277] + - - [4096, 3515, 1, 1024] + - [13, 85.538] + - - [1024, 3564, 1, 4096] + - [30, 71.394] + - - [4096, 3057, 1, 1024] + - [15, 85.623] + - - [1024, 3343, 1, 4096] + - [13, 80.516] + - - [4096, 3262, 1, 1024] + - [15, 84.071] + - - [1024, 3518, 1, 4096] + - [13, 71.001] + - - [33708, 3876, 1, 1024] + - [49, 86.905] + - - [4096, 3462, 1, 1024] + - [13, 84.459] + - - [1024, 3265, 1, 4096] + - [51, 84.563] + - - [4096, 3389, 1, 1024] + - [13, 85.47] + - - [4096, 3438, 1, 1024] + - [13, 87.248] + - - [1024, 3955, 1, 1024] + - [30, 82.289] + - - [1024, 3545, 1, 4096] + - [13, 72.643] + - - [1024, 3144, 1, 4096] + - [15, 81.473] + - - [1024, 3417, 1, 4096] + - [13, 82.763] + - - [4096, 3543, 1, 1024] + - [13, 86.639] + - - [4096, 3352, 1, 1024] + - [13, 84.996] + - - [33708, 3975, 1, 1024] + - [32, 87.207] + - - [148, 147, 432, 64] + - [36, 33.888] + - - [4096, 3137, 1, 1024] + - [13, 83.413] + - - [4096, 3506, 1, 1024] + - [13, 85.276] + - - [1024, 3975, 1, 1024] + - [32, 75.711] + - - [1024, 3859, 1, 4096] + - [32, 75.82] + - - [4096, 3369, 1, 1024] + - [13, 85.168] + - - [1024, 3434, 1, 4096] + - [31, 77.205] + - - [1024, 3292, 1, 4096] + - [15, 85.001] + - - [4096, 3523, 1, 1024] + - [13, 85.484] + - - [4096, 3380, 1, 1024] + - [13, 85.926] + - - [1024, 3408, 1, 4096] + - [30, 82.208] + - - [4096, 3221, 1, 1024] + - [15, 83.07] + - - [4096, 3270, 1, 1024] + - [15, 84.414] + - - [143, 143, 432, 64] + - [0, 32.254] + - - [1024, 3303, 1, 4096] + - [32, 85.457] + - - [4096, 3502, 1, 1024] + - [13, 85.402] + - - [1024, 3222, 1, 4096] + - [51, 83.309] + - - [4096, 2505, 1, 1024] + - [15, 85.551] + - - [4096, 3397, 1, 1024] + - [13, 86.102] + - - [4096, 3562, 1, 1024] + - [13, 86.688] + - - [4096, 3095, 1, 1024] + - [13, 82.506] + - - [1024, 3226, 1, 4096] + - [32, 83.485] + - - [177, 177, 352, 64] + - [17, 42.04] + - - [4096, 3360, 1, 1024] + - [13, 85.294] + - - [1024, 3942, 1, 1024] + - [30, 81.897] + - - [1024, 3298, 1, 4096] + - [51, 85.127] + - - [1024, 3381, 1, 4096] + - [13, 76.726] + - - [4096, 3314, 1, 1024] + - [15, 85.466] + - - [1024, 3492, 1, 4096] + - [13, 69.377] + - - [1024, 3430, 1, 4096] + - [31, 76.902] + - - [4096, 3977, 1, 1024] + - [13, 83.914] + - - [4096, 3546, 1, 1024] + - [13, 86.327] + - - [4096, 3640, 1, 1024] + - [13, 85.601] + - - [4096, 3441, 1, 1024] + - [13, 87.031] + - - [33708, 4059, 1, 1024] + - [32, 88.994] + - - [1024, 3978, 1, 1024] + - [32, 74.872] + - - [1024, 3376, 1, 4096] + - [13, 76.37] + - - [1024, 3482, 1, 4096] + - [13, 71.926] + - - [1024, 3563, 1, 4096] + - [30, 74.042] + - - [4096, 4020, 1, 1024] + - [13, 84.902] + - - [1024, 3271, 1, 4096] + - [51, 84.608] + - - [1024, 3291, 1, 4096] + - [32, 85.15] + - - [1024, 3431, 1, 4096] + - [14, 76.158] + - - [1024, 3481, 1, 4096] + - [13, 73.722] + - - [4096, 3461, 1, 1024] + - [13, 83.877] + - - [1024, 3574, 1, 4096] + - [49, 70.081] + - - [1024, 4059, 1, 1024] + - [32, 76.794] + - - [1024, 3421, 1, 4096] + - [13, 75.995] + - - [4096, 3224, 1, 1024] + - [15, 83.106] + - - [4096, 3437, 1, 1024] + - [13, 87.099] + - - [4096, 3168, 1, 1024] + - [13, 84.378] + - - [33708, 3990, 1, 1024] + - [51, 87.672] + - - [1024, 3349, 1, 4096] + - [30, 75.585] + - - [4096, 3335, 1, 1024] + - [13, 84.699] + - - [4096, 3400, 1, 1024] + - [13, 86.4] + - - [160, 159, 400, 64] + - [17, 39.694] + - - [1024, 3398, 1, 4096] + - [14, 76.614] + - - [1024, 3780, 1, 4096] + - [30, 76.875] + - - [4096, 3098, 1, 1024] + - [13, 82.47] + - - [1024, 4012, 1, 4096] + - [15, 78.761] + - - [4096, 3505, 1, 1024] + - [13, 85.362] + - - [4096, 3554, 1, 1024] + - [13, 86.661] + - - [4096, 3063, 1, 1024] + - [15, 85.953] + - - [1024, 3503, 1, 4096] + - [13, 70.915] + - - [1024, 3166, 1, 4096] + - [32, 82.113] + - - [1024, 3425, 1, 4096] + - [31, 76.911] + - - [1024, 3344, 1, 4096] + - [49, 73.072] + - - [4096, 3484, 1, 1024] + - [13, 84.712] + - - [1024, 3681, 1, 1024] + - [30, 76.722] + - - [1024, 4050, 1, 1024] + - [32, 76.74] + - - [4096, 3379, 1, 1024] + - [13, 85.538] + - - [4096, 3428, 1, 1024] + - [13, 86.932] + - - [1024, 3304, 1, 4096] + - [32, 85.362] + - - [1024, 3387, 1, 4096] + - [30, 79.262] + - - [4096, 3126, 1, 1024] + - [13, 83.413] + - - [1024, 3498, 1, 4096] + - [30, 74.809] + - - [1024, 3436, 1, 4096] + - [13, 82.596] + - - [4096, 3501, 1, 1024] + - [13, 85.452] + - - [4096, 3358, 1, 1024] + - [13, 85.402] + - - [4096, 3232, 1, 1024] + - [15, 83.449] + - - [1024, 3585, 1, 4096] + - [13, 76.253] + - - [4096, 3143, 1, 1024] + - [13, 83.724] + - - [4096, 3464, 1, 1024] + - [13, 84.473] + - - [1024, 3366, 1, 4096] + - [13, 76.889] + - - [4096, 3375, 1, 1024] + - [13, 85.655] + - - [4096, 2917, 1, 1024] + - [13, 84.856] + - - [4096, 4026, 1, 1024] + - [13, 85.416] + - - [1024, 3277, 1, 4096] + - [51, 84.685] + - - [1024, 3103, 1, 4096] + - [15, 80.561] + - - [33708, 3995, 1, 1024] + - [32, 87.676] + - - [1024, 3297, 1, 4096] + - [32, 84.604] + - - [4096, 3545, 1, 1024] + - [13, 86.467] + - - [1024, 3399, 1, 4096] + - [30, 83.607] + - - [33708, 3796, 1, 1024] + - [51, 88.592] + - - [4096, 3292, 1, 1024] + - [15, 84.545] + - - [33708, 3859, 1, 1024] + - [49, 86.625] + - - [4096, 3566, 1, 1024] + - [13, 86.648] + - - [4096, 3894, 1, 1024] + - [13, 85.163] + - - [4096, 3492, 1, 1024] + - [13, 85.087] + - - [1024, 3977, 1, 1024] + - [32, 75.874] + - - [1024, 3272, 1, 4096] + - [32, 84.45] + - - [135, 134, 480, 64] + - [17, 28.699] + - - [1024, 3355, 1, 4096] + - [13, 74.272] + - - [4096, 3419, 1, 1024] + - [13, 86.39] + - - [1024, 3404, 1, 4096] + - [30, 79.37] + - - [4096, 3999, 1, 1024] + - [13, 84.608] + - - [4096, 3166, 1, 1024] + - [13, 84.523] + - - [33708, 3840, 1, 1024] + - [51, 89.792] + - - [4096, 4032, 1, 1024] + - [13, 85.339] + - - [1024, 3573, 1, 4096] + - [30, 73.591] + - - [4096, 3366, 1, 1024] + - [13, 85.181] + - - [1024, 3541, 1, 4096] + - [13, 74.299] + - - [4096, 3207, 1, 1024] + - [15, 82.497] + - - [4096, 3272, 1, 1024] + - [15, 84.491] + - - [1024, 3334, 1, 4096] + - [13, 74.8] + - - [228, 228, 272, 64] + - [4, 46.06] + - - [4096, 3183, 1, 1024] + - [13, 84.703] + - - [4096, 3536, 1, 1024] + - [13, 85.799] + - - [1024, 4005, 1, 1024] + - [32, 76.027] + - - [1024, 3245, 1, 4096] + - [15, 83.868] + - - [4096, 3447, 1, 1024] + - [13, 86.927] + - - [1024, 3183, 1, 4096] + - [32, 82.312] + - - [1024, 3361, 1, 4096] + - [49, 72.747] + - - [33708, 3870, 1, 1024] + - [30, 86.63] + - - [1024, 3321, 1, 4096] + - [15, 85.384] + - - [1024, 3486, 1, 4096] + - [30, 74.507] + - - [4096, 4005, 1, 1024] + - [13, 84.969] + - - [4096, 3410, 1, 1024] + - [13, 86.332] + - - [1024, 3944, 1, 1024] + - [30, 82.213] + - - [4096, 3300, 1, 1024] + - [15, 85.231] + - - [4096, 3579, 1, 1024] + - [13, 87.117] + - - [4096, 3483, 1, 1024] + - [13, 84.996] + - - [4096, 3532, 1, 1024] + - [13, 85.885] + - - [1024, 3140, 1, 4096] + - [15, 81.337] + - - [1024, 3372, 1, 4096] + - [13, 77.421] + - - [1024, 3224, 1, 4096] + - [32, 83.535] + - - [4096, 3230, 1, 1024] + - [15, 83.413] + - - [4096, 3427, 1, 1024] + - [13, 86.747] + - - [1024, 3796, 1, 1024] + - [30, 79.072] + - - [143, 148, 432, 64] + - [17, 33.098] + - - [1024, 3616, 1, 4096] + - [13, 73.812] + - - [1024, 3315, 1, 4096] + - [32, 85.651] + - - [1024, 3476, 1, 4096] + - [49, 71.055] + - - [1024, 3509, 1, 4096] + - [13, 75.116] + - - [4096, 3357, 1, 1024] + - [13, 85.195] + - - [4096, 3406, 1, 1024] + - [13, 86.679] + - - [1024, 3558, 1, 4096] + - [13, 75.959] + - - [4096, 3593, 1, 1024] + - [13, 84.459] + - - [4096, 3247, 1, 1024] + - [15, 83.855] + - - [4096, 3088, 1, 1024] + - [13, 82.389] + - - [1024, 3213, 1, 4096] + - [32, 83.183] + - - [4096, 3511, 1, 1024] + - [13, 85.605] + - - [1024, 3365, 1, 4096] + - [13, 75.838] + - - [1024, 3504, 1, 4096] + - [30, 71.448] + - - [1024, 3442, 1, 4096] + - [13, 84.284] + - - [4096, 3474, 1, 1024] + - [13, 84.577] + - - [4096, 2984, 1, 1024] + - [15, 83.683] + - - [1024, 3876, 1, 4096] + - [32, 76.235] + - - [4096, 3337, 1, 1024] + - [13, 84.302] + - - [4096, 3450, 1, 1024] + - [13, 87.23] + - - [1024, 3547, 1, 4096] + - [13, 71.777] + - - [4096, 3291, 1, 1024] + - [15, 84.649] + - - [1024, 3340, 1, 4096] + - [30, 73.843] + - - [4096, 3491, 1, 1024] + - [13, 84.983] + - - [4096, 3348, 1, 1024] + - [13, 84.911] + - - [4096, 3906, 1, 1024] + - [13, 85.439] + - - [1024, 3477, 1, 4096] + - [49, 71.213] + - - [1024, 3397, 1, 4096] + - [13, 83.151] + - - [4096, 3165, 1, 1024] + - [13, 83.972] + - - [4096, 3470, 1, 1024] + - [13, 84.726] + - - [1024, 3526, 1, 4096] + - [30, 72.824] + - - [4096, 3365, 1, 1024] + - [13, 85.362] + - - [4096, 3319, 1, 1024] + - [15, 85.511] + - - [1024, 3401, 1, 4096] + - [30, 83.638] + - - [1024, 3294, 1, 4096] + - [15, 85.159] + - - [159, 159, 400, 64] + - [0, 38.778] + - - [1024, 3472, 1, 4096] + - [13, 70.775] + - - [4096, 3328, 1, 1024] + - [15, 85.984] + - - [1024, 3861, 1, 1024] + - [30, 79.763] + - - [1024, 3910, 1, 1024] + - [30, 81.423] + - - [1024, 3410, 1, 4096] + - [31, 76.469] + - - [1024, 3395, 1, 4096] + - [13, 79.366] + - - [4096, 3282, 1, 1024] + - [15, 84.496] + - - [1024, 3751, 1, 1024] + - [49, 78.12] + - - [4096, 3145, 1, 1024] + - [13, 83.512] + - - [4096, 3514, 1, 1024] + - [13, 85.529] + - - [4096, 3944, 1, 1024] + - [13, 86.386] + - - [1024, 3515, 1, 4096] + - [13, 70.694] + - - [4096, 3409, 1, 1024] + - [13, 86.052] + - - [4096, 3564, 1, 1024] + - [13, 86.819] + - - [4096, 3299, 1, 1024] + - [15, 85.028] + - - [1024, 3057, 1, 4096] + - [51, 79.14] + - - [4096, 3531, 1, 1024] + - [13, 85.795] + - - [4096, 3388, 1, 1024] + - [13, 85.998] + - - [1024, 3189, 1, 4096] + - [32, 82.637] + - - [1024, 3300, 1, 4096] + - [15, 85.33] + - - [1024, 3720, 1, 4096] + - [13, 79.438] + - - [1024, 3383, 1, 4096] + - [30, 77.15] + - - [1024, 3494, 1, 4096] + - [30, 73.776] + - - [1024, 3448, 1, 4096] + - [13, 84.193] + - - [4096, 3542, 1, 1024] + - [13, 86.341] + - - [1024, 3488, 1, 4096] + - [13, 74.114] + - - [4096, 3405, 1, 1024] + - [13, 86.219] + - - [1024, 3262, 1, 4096] + - [51, 84.396] + - - [33708, 4005, 1, 1024] + - [51, 87.965] + - - [1024, 3594, 1, 4096] + - [30, 71.123] + - - [4096, 3103, 1, 1024] + - [13, 82.362] + - - [4096, 3136, 1, 1024] + - [13, 83.498] + - - [1024, 3378, 1, 4096] + - [31, 75.314] + - - [4096, 3559, 1, 1024] + - [13, 86.702] + - - [4096, 3368, 1, 1024] + - [13, 85.479] + - - [4096, 3209, 1, 1024] + - [15, 82.795] + - - [4096, 3322, 1, 1024] + - [15, 85.7] + - - [1024, 3483, 1, 4096] + - [13, 73.424] + - - [4096, 3473, 1, 1024] + - [13, 84.509] + - - [4096, 3522, 1, 1024] + - [13, 85.845] + - - [1024, 3532, 1, 4096] + - [30, 72.788] + - - [4096, 3449, 1, 1024] + - [13, 87.37] + - - [1024, 3351, 1, 4096] + - [31, 74.443] + - - [1024, 3462, 1, 4096] + - [13, 72.991] + - - [4096, 3396, 1, 1024] + - [13, 86.187] + - - [132, 132, 480, 64] + - [0, 27.756] + - - [1024, 3416, 1, 4096] + - [13, 78.251] + - - [4096, 3469, 1, 1024] + - [13, 84.419] + - - [1024, 3582, 1, 4096] + - [13, 74.849] + - - [1024, 3230, 1, 4096] + - [32, 83.715] + - - [1024, 3489, 1, 4096] + - [30, 71.087] + - - [1024, 3427, 1, 4096] + - [14, 76.347] + - - [1024, 3346, 1, 4096] + - [13, 73.789] + - - [33708, 3977, 1, 1024] + - [51, 87.176] + - - [4096, 3796, 1, 1024] + - [13, 85.975] + - - [4096, 3176, 1, 1024] + - [13, 84.55] + - - [4096, 3990, 1, 1024] + - [13, 84.496] + - - [1024, 3257, 1, 4096] + - [51, 84.035] + - - [4096, 3343, 1, 1024] + - [13, 84.387] + - - [4096, 3440, 1, 1024] + - [13, 87.365] + - - [33708, 4030, 1, 1024] + - [32, 88.326] + - - [1024, 3190, 1, 4096] + - [51, 82.204] + - - [1024, 3389, 1, 4096] + - [30, 75.021] + - - [1024, 3500, 1, 4096] + - [13, 72.12] + - - [1024, 3471, 1, 4096] + - [30, 68.632] + - - [1024, 3438, 1, 4096] + - [13, 80.755] + - - [4096, 3513, 1, 1024] + - [13, 85.502] + - - [1024, 3562, 1, 4096] + - [13, 72.567] + - - [4096, 3616, 1, 1024] + - [13, 84.658] + - - [4096, 3955, 1, 1024] + - [13, 86.422] + - - [1024, 3441, 1, 4096] + - [30, 84.401] + - - [1024, 3236, 1, 4096] + - [32, 83.787] + - - [1024, 3524, 1, 4096] + - [30, 75.165] + - - [4096, 3460, 1, 1024] + - [13, 84.396] + - - [1024, 3384, 1, 4096] + - [30, 79.97] + - - [4096, 3387, 1, 1024] + - [13, 86.07] + - - [4096, 3436, 1, 1024] + - [13, 87.067] + - - [4096, 3277, 1, 1024] + - [15, 84.554] + - - [1024, 3457, 1, 4096] + - [13, 73.55] + - - [1024, 3999, 1, 4096] + - [32, 78.662] + - - [1024, 4032, 1, 4096] + - [32, 79.303] + - - [4096, 3541, 1, 1024] + - [13, 86.183] + - - [4096, 3334, 1, 1024] + - [13, 84.649] + - - [1024, 3393, 1, 4096] + - [31, 76.469] + - - [1024, 3411, 1, 4096] + - [31, 76.573] + - - [1024, 3822, 1, 1024] + - [30, 79.456] + - - [1024, 3593, 1, 4096] + - [32, 70.802] + - - [33708, 3822, 1, 1024] + - [51, 89.332] + - - [4096, 3504, 1, 1024] + - [13, 85.285] + - - [1024, 3163, 1, 4096] + - [51, 81.829] + - - [1024, 3357, 1, 4096] + - [13, 81.68] + - - [1024, 3906, 1, 4096] + - [32, 77.024] + - - [4096, 3415, 1, 1024] + - [13, 86.702] + - - [1024, 3406, 1, 4096] + - [30, 82.74] + - - [4096, 3321, 1, 1024] + - [15, 85.402] + - - [4096, 3584, 1, 1024] + - [13, 87.46] + - - [1024, 2736, 1, 4096] + - [13, 74.024] + - - [1024, 3110, 1, 4096] + - [15, 80.697] + - - [33708, 3999, 1, 1024] + - [51, 87.812] + - - [1024, 3093, 1, 4096] + - [51, 79.88] + - - [4096, 3378, 1, 1024] + - [13, 85.439] + - - [1024, 3543, 1, 4096] + - [13, 73.41] + - - [33708, 3925, 1, 1024] + - [49, 87.888] + - - [1024, 3352, 1, 4096] + - [14, 73.049] + - - [4096, 3780, 1, 1024] + - [13, 85.294] + - - [1024, 3990, 1, 4096] + - [32, 78.328] + - - [4096, 3500, 1, 1024] + - [13, 85.014] + - - [4096, 3996, 1, 1024] + - [13, 84.712] + - - [1024, 3247, 1, 4096] + - [15, 83.859] + - - [4096, 3395, 1, 1024] + - [13, 86.192] + - - [1024, 3169, 1, 4096] + - [15, 82.055] + - - [1024, 3088, 1, 4096] + - [15, 80.16] + - - [1024, 3584, 1, 4096] + - [30, 71.263] + - - [4096, 3093, 1, 1024] + - [13, 82.244] + - - [1024, 3538, 1, 4096] + - [30, 71.578] + - - [1024, 3996, 1, 1024] + - [32, 76.05] + - - [1024, 3581, 1, 4096] + - [13, 74.529] + - - [4096, 3374, 1, 1024] + - [13, 85.393] + - - [33708, 3751, 1, 1024] + - [51, 87.712] + - - [4096, 3215, 1, 1024] + - [15, 82.718] + - - [4096, 3312, 1, 1024] + - [15, 85.461] + - - [4096, 3581, 1, 1024] + - [13, 87.415] + - - [4096, 3479, 1, 1024] + - [13, 85.154] + - - [4096, 3544, 1, 1024] + - [13, 86.557] + - - [1024, 3870, 1, 1024] + - [30, 80.462] + - - [1024, 3374, 1, 4096] + - [14, 76.09] + - - [1024, 2967, 1, 4096] + - [15, 77.096] + - - [4096, 3455, 1, 1024] + - [13, 87.536] + - - [4096, 3942, 1, 1024] + - [13, 86.409] + - - [1024, 3528, 1, 4096] + - [13, 73.04] + - - [4096, 3186, 1, 1024] + - [13, 84.712] + - - [1024, 3976, 1, 1024] + - [32, 75.621] + - - [1024, 3511, 1, 4096] + - [13, 71.163] + - - [4096, 3573, 1, 1024] + - [13, 86.819] + - - [4096, 3561, 1, 1024] + - [13, 86.733] + - - [4096, 3418, 1, 1024] + - [13, 86.553] + - - [33708, 3906, 1, 1024] + - [49, 87.618] + - - [4096, 3259, 1, 1024] + - [15, 83.891] + - - [4096, 3308, 1, 1024] + - [15, 85.29] + - - [1024, 3419, 1, 4096] + - [13, 80.913] + - - [1024, 3215, 1, 4096] + - [15, 83.277] + - - [1024, 4030, 1, 4096] + - [32, 79.253] + - - [4096, 3459, 1, 1024] + - [13, 84.238] + - - [1024, 3572, 1, 4096] + - [30, 73.555] + - - [1024, 3137, 1, 4096] + - [51, 81.342] + - - [1024, 3312, 1, 4096] + - [32, 85.551] + - - [1024, 3925, 1, 4096] + - [30, 80.421] + - - [1024, 3453, 1, 4096] + - [13, 81.509] + - - [4096, 3435, 1, 1024] + - [13, 86.747] + - - [1024, 3176, 1, 4096] + - [32, 82.226] + - - [1024, 3444, 1, 4096] + - [13, 78.048] + - - [4096, 3975, 1, 1024] + - [13, 84.18] + - - [4096, 3182, 1, 1024] + - [13, 84.468] + - - [1024, 3475, 1, 4096] + - [13, 71.61] + - - [33708, 3955, 1, 1024] + - [49, 88.552] + - - [4096, 3446, 1, 1024] + - [13, 87.139] + - - [1024, 3138, 1, 4096] + - [51, 81.166] + - - [1024, 3549, 1, 4096] + - [13, 75.201] + - - [4096, 3287, 1, 1024] + - [15, 84.726] + - - [1024, 3342, 1, 4096] + - [13, 79.203] + - - [4096, 3519, 1, 1024] + - [13, 85.669] + - - [4096, 3552, 1, 1024] + - [13, 86.594] + - - [4096, 3859, 1, 1024] + - [13, 84.55] + - - [33708, 3969, 1, 1024] + - [51, 87.203] + - - [1024, 3369, 1, 4096] + - [31, 74.349] + - - [4096, 3482, 1, 1024] + - [13, 84.649] + - - [1024, 3306, 1, 4096] + - [32, 85.082] + - - [1024, 3474, 1, 4096] + - [30, 70.306] + - - [4096, 3377, 1, 1024] + - [13, 85.461] + - - [4096, 3426, 1, 1024] + - [13, 86.575] + - - [4096, 2935, 1, 1024] + - [13, 84.861] + - - [4096, 3267, 1, 1024] + - [15, 84.279] + - - [1024, 3299, 1, 4096] + - [51, 85.384] + - - [1024, 3456, 1, 4096] + - [13, 84.478] + - - [1024, 3280, 1, 4096] + - [32, 84.82] + - - [1024, 3555, 1, 4096] + - [13, 75.215] + - - [4096, 3499, 1, 1024] + - [13, 85.195] + - - [4096, 3356, 1, 1024] + - [13, 85.168] + - - [1024, 3412, 1, 4096] + - [14, 76.717] + - - [1024, 2984, 1, 4096] + - [32, 77.547] + - - [4096, 3141, 1, 1024] + - [13, 83.688] + - - [4096, 3510, 1, 1024] + - [13, 85.429] + - - [1024, 3995, 1, 1024] + - [32, 75.58] + - - [1024, 3517, 1, 4096] + - [13, 74.628] + - - [1024, 3455, 1, 4096] + - [13, 80.042] + - - [1024, 3939, 1, 1024] + - [30, 82.019] + - - [1024, 3447, 1, 4096] + - [31, 77.196] + - - [1024, 3969, 1, 4096] + - [15, 77.994] + - - [4096, 3527, 1, 1024] + - [13, 85.723] + - - [4096, 3336, 1, 1024] + - [13, 84.64] + - - [1024, 3191, 1, 4096] + - [32, 82.601] + - - [1024, 3302, 1, 4096] + - [15, 85.235] + - - [1024, 3337, 1, 4096] + - [31, 75.237] + - - [4096, 3290, 1, 1024] + - [15, 84.563] + - - [1024, 3512, 1, 4096] + - [13, 72.079] + - - [1024, 3433, 1, 4096] + - [31, 74.881] + - - [4096, 3876, 1, 1024] + - [13, 84.938] + - - [4096, 3490, 1, 1024] + - [13, 85.19] + - - [4096, 3064, 1, 1024] + - [15, 85.903] + - - [1024, 3508, 1, 4096] + - [13, 73.902] + - - [1024, 3956, 1, 4096] + - [13, 80.593] + - - [4096, 3417, 1, 1024] + - [13, 86.639] + - - [1024, 3248, 1, 4096] + - [51, 84.035] + - - [1024, 2499, 1, 4096] + - [13, 70.762] + - - [1024, 3186, 1, 4096] + - [51, 82.596] + - - [1024, 3180, 1, 4096] + - [15, 82.501] + - - [4096, 3364, 1, 1024] + - [13, 84.983] + - - [4096, 3976, 1, 1024] + - [13, 84.279] + - - [4096, 3205, 1, 1024] + - [15, 82.781] + - - [4096, 3318, 1, 1024] + - [15, 85.705] + - - [1024, 3377, 1, 4096] + - [31, 74.782] + - - [1024, 3485, 1, 4096] + - [49, 69.742] + - - [4096, 3181, 1, 1024] + - [13, 84.45] + - - [4096, 3550, 1, 1024] + - [13, 86.354] + - - [1024, 3534, 1, 4096] + - [30, 69.995] + - - [1024, 3860, 1, 1024] + - [30, 80.169] + - - [160, 160, 400, 64] + - [36, 40.01] + - - [4096, 3445, 1, 1024] + - [13, 87.329] + - - [1024, 3391, 1, 4096] + - [13, 75.874] + - - [1024, 3221, 1, 4096] + - [15, 83.395] + - - [4096, 3079, 1, 1024] + - [13, 81.888] + - - [4096, 3144, 1, 1024] + - [13, 83.607] + - - [1024, 3270, 1, 4096] + - [32, 84.518] + - - [1024, 3561, 1, 4096] + - [13, 73.428] + - - [1024, 3480, 1, 4096] + - [13, 73.785] + - - [4096, 3408, 1, 1024] + - [13, 86.354] + - - [1024, 3418, 1, 4096] + - [30, 79.659] + - - [4096, 3298, 1, 1024] + - [15, 84.906] + - - [1024, 3640, 1, 1024] + - [49, 76.05] + - - [1024, 3449, 1, 4096] + - [31, 77.358] + - - [1024, 4020, 1, 4096] + - [32, 79.063] + - - [4096, 3481, 1, 1024] + - [13, 84.771] + - - [4096, 3530, 1, 1024] + - [13, 86.219] + - - [1024, 3216, 1, 4096] + - [32, 83.065] + - - [1024, 3491, 1, 4096] + - [30, 72.345] + - - [1024, 3154, 1, 4096] + - [51, 81.743] + - - [4096, 3425, 1, 1024] + - [13, 86.688] + - - [1024, 3348, 1, 4096] + - [13, 75.747] + - - [1024, 3415, 1, 4096] + - [13, 77.101] + - - [1024, 4026, 1, 1024] + - [32, 76.108] + - - [1024, 3367, 1, 4096] + - [13, 77.769] + - - [1024, 3259, 1, 4096] + - [32, 84.378] + - - [1024, 3894, 1, 4096] + - [13, 78.292] + - - [4096, 3355, 1, 1024] + - [13, 84.762] + - - [4096, 3404, 1, 1024] + - [13, 86.278] + - - [1024, 3308, 1, 4096] + - [15, 85.389] + - - [4096, 3245, 1, 1024] + - [15, 83.656] + - - [1024, 3502, 1, 4096] + - [13, 74.863] + - - [33708, 4032, 1, 1024] + - [51, 88.506] + - - [1024, 3424, 1, 4096] + - [13, 77.809] + - - [4096, 3509, 1, 1024] + - [13, 85.457] + - - [4096, 3558, 1, 1024] + - [13, 86.729] + - - [1024, 3900, 1, 1024] + - [30, 81.071] + - - [1024, 2505, 1, 4096] + - [13, 71.574] + - - [4096, 3472, 1, 1024] + - [13, 84.617] + - - [1024, 3386, 1, 4096] + - [31, 75.806] + - - [4096, 3383, 1, 1024] + - [13, 85.691] + - - [4096, 3448, 1, 1024] + - [13, 87.148] + - - [4096, 4030, 1, 1024] + - [13, 85.484] + - - [4096, 3289, 1, 1024] + - [15, 84.753] + - - [1024, 3459, 1, 4096] + - [13, 69.684] + - - [1024, 2918, 1, 4096] + - [15, 75.919] + - - [4096, 3489, 1, 1024] + - [13, 84.96] + - - [4096, 3346, 1, 1024] + - [13, 84.902] + - - [4096, 3572, 1, 1024] + - [13, 87.203] + - - [1024, 3955, 1, 4096] + - [13, 83.823] + - - [4096, 3236, 1, 1024] + - [15, 83.413] + - - [4096, 3163, 1, 1024] + - [13, 84.356] + - - [4096, 3468, 1, 1024] + - [13, 84.676] + - - [1024, 3165, 1, 4096] + - [15, 81.987] + - - [1024, 3276, 1, 4096] + - [32, 84.902] + - - [1024, 3359, 1, 4096] + - [14, 75.684] + - - [4096, 3363, 1, 1024] + - [13, 85.06] + - - [1024, 3385, 1, 4096] + - [30, 79.772] + - - [1024, 3207, 1, 4096] + - [32, 83.097] + - - [1024, 3458, 1, 4096] + - [13, 70.816] + - - [4096, 3110, 1, 1024] + - [13, 82.695] + - - [4096, 3925, 1, 1024] + - [13, 85.935] + - - [1024, 3975, 1, 4096] + - [32, 78.139] + - - [4096, 3549, 1, 1024] + - [13, 86.282] + - - [4096, 3342, 1, 1024] + - [13, 84.712] + - - [1024, 3859, 1, 1024] + - [30, 80.354] + - - [1024, 3497, 1, 4096] + - [13, 71.394] + - - [4096, 3280, 1, 1024] + - [15, 84.184] + - - [1024, 3435, 1, 4096] + - [31, 77.078] + - - [1024, 3354, 1, 4096] + - [31, 75.459] + - - [4096, 3191, 1, 1024] + - [13, 84.838] + - - [4096, 3512, 1, 1024] + - [13, 85.714] + - - [1024, 3055, 1, 4096] + - [32, 79.091] + - - [4096, 2499, 1, 1024] + - [15, 85.105] + - - [1024, 3233, 1, 4096] + - [32, 83.787] + - - [4096, 3423, 1, 1024] + - [13, 86.661] + - - [1024, 3319, 1, 4096] + - [51, 85.646] + - - [4096, 3297, 1, 1024] + - [15, 84.942] + - - [4096, 3154, 1, 1024] + - [13, 83.918] + - - [1024, 3540, 1, 4096] + - [13, 72.977] + - - [1024, 3289, 1, 4096] + - [32, 85.114] + - - [4096, 3529, 1, 1024] + - [13, 85.971] + - - [4096, 3386, 1, 1024] + - [13, 85.822] + - - [4096, 3276, 1, 1024] + - [15, 84.261] + - - [1024, 3244, 1, 4096] + - [32, 83.914] + - - [1024, 3182, 1, 4096] + - [32, 82.47] + - - [4096, 3540, 1, 1024] + - [13, 86.187] + - - [1024, 3360, 1, 4096] + - [13, 74.723] + - - [1024, 3942, 1, 4096] + - [13, 79.221] + - - [4096, 3403, 1, 1024] + - [13, 86.007] + - - [4096, 3101, 1, 1024] + - [13, 82.61] + - - [4096, 2918, 1, 1024] + - [13, 84.667] + - - [1024, 3465, 1, 4096] + - [13, 67.919] + - - [33708, 3780, 1, 1024] + - [51, 88.403] + - - [4096, 3557, 1, 1024] + - [13, 86.544] + - - [4096, 3414, 1, 1024] + - [13, 86.657] + - - [1024, 3948, 1, 1024] + - [30, 82.019] + - - [4096, 3320, 1, 1024] + - [15, 85.637] + - - [4096, 2765, 1, 1024] + - [15, 84.879] + - - [1024, 3978, 1, 4096] + - [32, 78.107] + - - [4096, 3487, 1, 1024] + - [13, 84.825] + - - [4096, 3520, 1, 1024] + - [13, 85.736] + - - [1024, 3139, 1, 4096] + - [51, 81.396] + - - [1024, 3314, 1, 4096] + - [32, 85.678] + - - [4096, 3431, 1, 1024] + - [13, 86.571] + - - [1024, 3446, 1, 4096] + - [13, 78.748] + - - [1024, 4059, 1, 4096] + - [32, 79.695] + - - [4096, 3345, 1, 1024] + - [13, 84.834] + - - [4096, 3394, 1, 1024] + - [13, 86.021] + - - [1024, 3927, 1, 1024] + - [30, 81.617] + - - [4096, 3235, 1, 1024] + - [15, 83.444] + - - [1024, 3328, 1, 4096] + - [15, 86.354] + - - [33708, 3956, 1, 1024] + - [49, 88.601] + - - [4096, 3467, 1, 1024] + - [13, 84.265] + - - [1024, 3287, 1, 4096] + - [51, 84.902] + - - [4096, 3214, 1, 1024] + - [15, 82.722] + - - [4096, 3910, 1, 1024] + - [13, 85.678] + - - [1024, 3780, 1, 1024] + - [30, 78.522] + - - [1024, 3371, 1, 4096] + - [14, 75.589] + - - [4096, 3478, 1, 1024] + - [13, 84.599] + - - [1024, 3546, 1, 4096] + - [13, 70.694] + - - [1024, 4012, 1, 1024] + - [32, 76.068] + - - [4096, 3341, 1, 1024] + - [13, 84.514] + - - [4096, 3454, 1, 1024] + - [13, 87.446] + - - [4096, 3295, 1, 1024] + - [15, 84.87] + - - [4096, 3072, 1, 1024] + - [15, 86.467] + - - [1024, 3282, 1, 4096] + - [15, 84.766] + - - [33708, 3720, 1, 1024] + - [51, 87.13] + - - [1024, 3681, 1, 4096] + - [13, 75.671] + - - [1024, 4050, 1, 4096] + - [32, 79.546] + - - [4096, 3495, 1, 1024] + - [13, 85.217] + - - [4096, 3560, 1, 1024] + - [13, 86.972] + - - [4096, 3751, 1, 1024] + - [13, 85.118] + - - [1024, 3414, 1, 4096] + - [31, 76.812] + - - [33708, 3860, 1, 1024] + - [49, 86.589] + - - [1024, 3325, 1, 4096] + - [15, 85.308] + - - [4096, 3458, 1, 1024] + - [13, 84.085] + - - [4096, 2967, 1, 1024] + - [15, 83.264] + - - [1024, 3519, 1, 4096] + - [13, 71.881] + - - [4096, 3385, 1, 1024] + - [13, 85.312] + - - [4096, 3434, 1, 1024] + - [13, 87.004] + - - [1024, 3552, 1, 4096] + - [13, 73.049] + - - [4096, 3822, 1, 1024] + - [13, 86.363] + - - [1024, 3544, 1, 4096] + - [13, 72.58] + - - [4096, 3539, 1, 1024] + - [13, 85.917] + - - [4096, 3332, 1, 1024] + - [13, 84.55] + - - [1024, 3145, 1, 4096] + - [51, 81.54] + - - [1024, 3535, 1, 4096] + - [13, 75.616] + - - [1024, 3320, 1, 4096] + - [32, 85.804] + - - [33708, 4012, 1, 1024] + - [32, 88.191] + - - [4096, 3286, 1, 1024] + - [15, 84.369] + - - [1024, 3514, 1, 4096] + - [13, 71.475] + - - [1024, 2765, 1, 4096] + - [13, 79.361] + - - [1024, 3452, 1, 4096] + - [13, 79.054] + - - [4096, 3518, 1, 1024] + - [13, 85.52] + - - [1024, 3529, 1, 4096] + - [13, 70.324] + - - [4096, 3413, 1, 1024] + - [13, 86.043] + - - [33708, 4050, 1, 1024] + - [51, 88.773] + - - [1024, 3525, 1, 4096] + - [30, 75.219] + - - [4096, 3303, 1, 1024] + - [15, 84.811] + - - [1024, 3382, 1, 4096] + - [30, 79.465] + - - [1024, 3390, 1, 4096] + - [30, 79.14] + - - [1024, 3977, 1, 4096] + - [32, 78.211] + - - [1024, 3184, 1, 4096] + - [32, 82.65] + - - [4096, 3535, 1, 1024] + - [13, 86.066] + - - [4096, 3376, 1, 1024] + - [13, 85.623] + - - [4096, 3978, 1, 1024] + - [13, 84.414] + - - [1024, 3136, 1, 4096] + - [51, 81.315] + - - [1024, 3293, 1, 4096] + - [32, 85.168] + - - [4096, 3266, 1, 1024] + - [15, 84.08] + - - [1024, 3487, 1, 4096] + - [13, 70.699] + - - [1024, 3409, 1, 4096] + - [30, 73.063] + - - [4096, 3498, 1, 1024] + - [13, 85.141] + - - [1024, 3520, 1, 4096] + - [13, 73.586] + - - [1024, 3530, 1, 4096] + - [14, 68.79] + - - [4096, 3393, 1, 1024] + - [13, 85.867] + - - [4096, 3140, 1, 1024] + - [13, 83.656] + - - [1024, 3536, 1, 4096] + - [13, 71.633] + - - [1024, 3288, 1, 4096] + - [15, 84.969] + - - [1024, 4005, 1, 4096] + - [51, 78.684] + - - [1024, 3579, 1, 4096] + - [13, 74.001] + - - [4096, 3372, 1, 1024] + - [13, 85.781] + - - [1024, 3440, 1, 4096] + - [30, 84.82] + - - [4096, 3213, 1, 1024] + - [15, 82.74] + - - [4096, 3477, 1, 1024] + - [13, 84.784] + - - [4096, 3526, 1, 1024] + - [13, 85.962] + - - [1024, 3493, 1, 4096] + - [13, 73.004] + - - [1024, 3944, 1, 4096] + - [13, 82.222] + - - [4096, 3453, 1, 1024] + - [13, 87.496] + - - [1024, 3350, 1, 4096] + - [13, 78.73] + - - [4096, 3184, 1, 1024] + - [13, 84.847] + - - [1024, 3423, 1, 4096] + - [13, 77.814] + - - [4096, 3351, 1, 1024] + - [13, 84.884] + - - [4096, 3416, 1, 1024] + - [13, 86.607] + - - [1024, 3796, 1, 4096] + - [30, 74.994] + - - [4096, 3257, 1, 1024] + - [15, 83.882] + - - [4096, 3306, 1, 1024] + - [15, 85.285] + - - [33708, 4020, 1, 1024] + - [51, 88.06] + - - [1024, 3426, 1, 4096] + - [30, 78.346] + - - [4096, 3457, 1, 1024] + - [13, 84.135] + - - [1024, 2935, 1, 4096] + - [51, 76.167] + - - [1024, 3046, 1, 4096] + - [15, 79.041] + - - [4096, 3433, 1, 1024] + - [13, 86.918] + - - [1024, 3256, 1, 4096] + - [51, 84.171] + - - [1024, 3531, 1, 4096] + - [13, 74.579] + - - [4096, 3180, 1, 1024] + - [13, 84.514] + - - [1024, 3388, 1, 4096] + - [13, 82.98] + - - [4096, 3444, 1, 1024] + - [13, 87.121] + - - [1024, 3501, 1, 4096] + - [30, 69.602] + - - [1024, 3266, 1, 4096] + - [32, 84.568] + - - [1024, 3267, 1, 4096] + - [32, 84.509] + - - [1024, 3461, 1, 4096] + - [13, 71.772] + - - [4096, 3870, 1, 1024] + - [13, 84.649] + - - [4096, 3517, 1, 1024] + - [13, 85.881] + - - [1024, 3566, 1, 4096] + - [30, 72.679] + - - [4096, 3574, 1, 1024] + - [13, 86.923] + - - [1024, 3876, 1, 1024] + - [30, 80.44] + - - [4096, 3720, 1, 1024] + - [13, 84.252] + - - [4096, 3248, 1, 1024] + - [15, 83.914] + - - [4096, 4059, 1, 1024] + - [13, 85.885] + - - [1024, 3380, 1, 4096] + - [30, 71.245] + - - [4096, 3480, 1, 1024] + - [13, 84.838] + - - [1024, 3335, 1, 4096] + - [31, 75.31] + - - [1024, 3345, 1, 4096] + - [13, 82.086] + - - [4096, 3391, 1, 1024] + - [13, 85.998] + - - [4096, 3424, 1, 1024] + - [13, 86.706] + - - [1024, 3394, 1, 4096] + - [13, 83.616] + - - [4096, 3265, 1, 1024] + - [15, 84.148] + - - [1024, 3014, 1, 4096] + - [15, 78.215] + - - [4096, 3497, 1, 1024] + - [13, 85.073] + - - [4096, 3354, 1, 1024] + - [13, 85.15] + - - [4096, 3055, 1, 1024] + - [15, 85.587] + - - [1024, 3499, 1, 4096] + - [13, 74.371] + - - [1024, 3162, 1, 4096] + - [32, 81.996] + - - [4096, 3244, 1, 1024] + - [15, 83.584] + - - [1024, 3437, 1, 4096] + - [31, 77.042] + - - [1024, 3356, 1, 4096] + - [31, 74.692] + - - [4096, 3139, 1, 1024] + - [13, 83.422] + - - [4096, 3508, 1, 1024] + - [13, 85.488] + - - [1024, 3235, 1, 4096] + - [32, 83.611] + - - [1024, 3910, 1, 4096] + - [32, 76.934] + - - [4096, 3371, 1, 1024] + - [13, 85.272] + - - [1024, 3751, 1, 4096] + - [13, 74.028] + - - [4096, 3325, 1, 1024] + - [15, 85.425] + - - [1024, 3413, 1, 4096] + - [13, 76.492] + - - [1024, 3542, 1, 4096] + - [30, 69.287] + - - [33708, 3900, 1, 1024] + - [30, 87.234] + - - [4096, 3525, 1, 1024] + - [13, 85.714] + - - [4096, 3382, 1, 1024] + - [13, 85.493] + - - [1024, 3339, 1, 4096] + - [14, 74.831] + - - [4096, 3288, 1, 1024] + - [15, 84.586] + - - [1024, 3141, 1, 4096] + - [15, 81.355] + - - [1024, 3168, 1, 4096] + - [51, 82.095] + - - [4096, 3488, 1, 1024] + - [13, 84.726] + - - [4096, 3046, 1, 1024] + - [15, 85.105] + - - [1024, 3362, 1, 4096] + - [13, 76.731] + - - [33708, 3942, 1, 1024] + - [49, 88.317] + - - [4096, 3399, 1, 1024] + - [13, 85.953] + - - [1024, 3720, 1, 1024] + - [30, 77.322] + - - [4096, 3563, 1, 1024] + - [13, 87.045] + - - [1024, 3273, 1, 4096] + - [51, 84.586] + - - [4096, 3162, 1, 1024] + - [13, 84.198] + - - [1024, 3467, 1, 4096] + - [30, 73.992] + - - [1024, 3130, 1, 4096] + - [15, 81.17] + - - [1024, 3405, 1, 4096] + - [30, 77.949] + - - [4096, 3362, 1, 1024] + - [13, 85.398] + - - [1024, 3960, 1, 1024] + - [30, 82.276] + - - [1024, 3712, 1, 36548] + - [1, 81.725] + - - [1024, 3712, 1, 1024] + - [30, 77.547] + - - [4032, 384, 1, 64] + - [0, 47.662] + - - [1024, 2048, 1, 49] + - [8, 51.059] + - - [4608, 512, 1, 49] + - [36, 51.948] + - - [9216, 512, 1, 4096] + - [32, 82.966] + - - [3456, 384, 1, 289] + - [36, 63.615] + - - [3456, 384, 1, 169] + - [36, 61.477] + - - [4096, 512, 1, 1001] + - [7, 76.244] + - - [384, 448, 49, 512] + - [49, 72.323] + - - [384, 448, 64, 256] + - [13, 71.073] + - - [384, 448, 36, 256] + - [12, 68.714] + - - [384, 448, 49, 256] + - [29, 70.55] + - - [384, 448, 64, 512] + - [49, 72.909] + - - [384, 448, 36, 512] + - [13, 73.131] + - - [1024, 6400, 1, 65] + - [0, 70.193] + - - [4096, 6400, 1, 256] + - [13, 86.061] + - - [512, 3194, 1, 2048] + - [49, 71.398] + - - [512, 3222, 1, 2048] + - [30, 74.046] + - - [512, 3234, 1, 2048] + - [30, 74.29] + - - [512, 3242, 1, 2048] + - [30, 74.331] + - - [512, 3257, 1, 2048] + - [30, 74.886] + - - [512, 3332, 1, 2048] + - [31, 73.943] + - - [512, 3336, 1, 2048] + - [14, 74.128] + - - [512, 3378, 1, 2048] + - [50, 74.696] + - - [512, 3396, 1, 2048] + - [50, 75.201] + - - [512, 3399, 1, 2048] + - [31, 75.283] + - - [512, 3451, 1, 2048] + - [31, 76.284] + - - [512, 3456, 1, 2048] + - [31, 76.862] + - - [512, 3458, 1, 2048] + - [51, 65.876] + - - [512, 3467, 1, 2048] + - [32, 66.011] + - - [512, 3468, 1, 2048] + - [51, 65.979] + - - [512, 3470, 1, 2048] + - [15, 66.079] + - - [512, 3477, 1, 2048] + - [15, 66.259] + - - [512, 3478, 1, 2048] + - [15, 66.182] + - - [512, 3495, 1, 2048] + - [15, 66.566] + - - [512, 3507, 1, 2048] + - [15, 66.769] + - - [512, 3515, 1, 2048] + - [15, 66.909] + - - [512, 3517, 1, 2048] + - [51, 66.927] + - - [2048, 2864, 1, 512] + - [13, 76.487] + - - [2048, 3287, 1, 512] + - [13, 80.498] + - - [2048, 3412, 1, 512] + - [30, 83.264] + - - [2048, 3456, 1, 512] + - [13, 85.317] + - - [2048, 3466, 1, 512] + - [30, 79.451] + - - [2048, 3476, 1, 512] + - [30, 79.573] + - - [2048, 3999, 1, 512] + - [30, 80.458] + - - [33708, 189, 1, 512] + - [29, 70.333] + - - [33708, 2496, 1, 512] + - [13, 86.458] + - - [33708, 3864, 1, 512] + - [13, 86.828] + - - [33708, 3969, 1, 512] + - [13, 86.413] + - - [33708, 3995, 1, 512] + - [13, 86.914] + - - [134, 134, 240, 64] + - [17, 27.061] + - - [135, 134, 240, 64] + - [36, 27.688] + - - [135, 135, 240, 64] + - [25, 27.675] + - - [512, 2790, 1, 2048] + - [48, 66.53] + - - [512, 2864, 1, 2048] + - [30, 66.07] + - - [512, 3092, 1, 2048] + - [13, 71.181] + - - [512, 3113, 1, 2048] + - [30, 71.524] + - - [512, 3137, 1, 2048] + - [13, 72.084] + - - [512, 3165, 1, 2048] + - [13, 72.666] + - - [512, 3166, 1, 2048] + - [49, 72.675] + - - [512, 3219, 1, 2048] + - [13, 74.046] + - - [512, 3237, 1, 2048] + - [49, 74.317] + - - [512, 3246, 1, 2048] + - [13, 74.371] + - - [512, 3249, 1, 2048] + - [30, 74.678] + - - [512, 3251, 1, 2048] + - [49, 74.944] + - - [512, 3262, 1, 2048] + - [30, 75.016] + - - [512, 3268, 1, 2048] + - [13, 75.161] + - - [512, 3282, 1, 2048] + - [13, 75.314] + - - [512, 3286, 1, 2048] + - [30, 75.698] + - - [512, 3287, 1, 2048] + - [30, 75.531] + - - [512, 3293, 1, 2048] + - [30, 75.684] + - - [512, 3297, 1, 2048] + - [30, 75.644] + - - [512, 3307, 1, 2048] + - [13, 75.977] + - - [512, 3314, 1, 2048] + - [49, 76.027] + - - [512, 3315, 1, 2048] + - [49, 76.05] + - - [512, 3319, 1, 2048] + - [30, 76.054] + - - [512, 3322, 1, 2048] + - [49, 76.559] + - - [512, 3323, 1, 2048] + - [30, 76.329] + - - [512, 3324, 1, 2048] + - [49, 76.189] + - - [512, 3325, 1, 2048] + - [49, 76.316] + - - [512, 3327, 1, 2048] + - [49, 76.442] + - - [512, 3329, 1, 2048] + - [31, 73.92] + - - [512, 3339, 1, 2048] + - [50, 74.051] + - - [512, 3342, 1, 2048] + - [31, 74.164] + - - [512, 3344, 1, 2048] + - [50, 74.222] + - - [512, 3358, 1, 2048] + - [14, 74.637] + - - [512, 3360, 1, 2048] + - [50, 74.714] + - - [512, 3364, 1, 2048] + - [31, 74.633] + - - [512, 3365, 1, 2048] + - [31, 74.678] + - - [512, 3369, 1, 2048] + - [50, 74.786] + - - [512, 3371, 1, 2048] + - [50, 74.922] + - - [512, 3374, 1, 2048] + - [31, 74.764] + - - [512, 3376, 1, 2048] + - [31, 74.989] + - - [512, 3377, 1, 2048] + - [50, 74.759] + - - [512, 3381, 1, 2048] + - [50, 75.057] + - - [512, 3382, 1, 2048] + - [31, 75.129] + - - [512, 3383, 1, 2048] + - [31, 75.048] + - - [512, 3384, 1, 2048] + - [31, 75.17] + - - [512, 3385, 1, 2048] + - [50, 75.134] + - - [512, 3386, 1, 2048] + - [50, 75.057] + - - [512, 3388, 1, 2048] + - [50, 75.179] + - - [512, 3390, 1, 2048] + - [50, 75.188] + - - [512, 3391, 1, 2048] + - [31, 75.35] + - - [512, 3402, 1, 2048] + - [31, 75.391] + - - [512, 3410, 1, 2048] + - [31, 75.616] + - - [512, 3412, 1, 2048] + - [14, 75.612] + - - [512, 3414, 1, 2048] + - [50, 75.684] + - - [512, 3415, 1, 2048] + - [50, 75.752] + - - [512, 3418, 1, 2048] + - [14, 75.761] + - - [512, 3420, 1, 2048] + - [14, 75.774] + - - [512, 3422, 1, 2048] + - [50, 75.928] + - - [512, 3425, 1, 2048] + - [31, 75.941] + - - [512, 3426, 1, 2048] + - [31, 75.851] + - - [512, 3427, 1, 2048] + - [31, 75.797] + - - [512, 3428, 1, 2048] + - [50, 75.869] + - - [512, 3430, 1, 2048] + - [14, 76.004] + - - [512, 3431, 1, 2048] + - [50, 75.842] + - - [512, 3432, 1, 2048] + - [50, 76.004] + - - [512, 3438, 1, 2048] + - [50, 76.059] + - - [512, 3439, 1, 2048] + - [50, 76.198] + - - [512, 3440, 1, 2048] + - [31, 76.149] + - - [512, 3443, 1, 2048] + - [31, 76.271] + - - [512, 3445, 1, 2048] + - [31, 76.185] + - - [512, 3447, 1, 2048] + - [31, 76.325] + - - [512, 3448, 1, 2048] + - [31, 76.374] + - - [512, 3450, 1, 2048] + - [50, 76.42] + - - [512, 3452, 1, 2048] + - [31, 76.379] + - - [512, 3453, 1, 2048] + - [31, 76.501] + - - [512, 3455, 1, 2048] + - [31, 76.492] + - - [512, 3457, 1, 2048] + - [15, 65.781] + - - [512, 3459, 1, 2048] + - [15, 65.925] + - - [512, 3460, 1, 2048] + - [51, 65.934] + - - [512, 3461, 1, 2048] + - [15, 65.894] + - - [512, 3462, 1, 2048] + - [51, 65.948] + - - [512, 3466, 1, 2048] + - [32, 66.016] + - - [512, 3471, 1, 2048] + - [51, 65.975] + - - [512, 3472, 1, 2048] + - [32, 65.993] + - - [512, 3475, 1, 2048] + - [15, 66.237] + - - [512, 3476, 1, 2048] + - [15, 66.124] + - - [512, 3479, 1, 2048] + - [51, 66.25] + - - [512, 3480, 1, 2048] + - [51, 66.219] + - - [512, 3481, 1, 2048] + - [51, 66.264] + - - [512, 3483, 1, 2048] + - [51, 66.395] + - - [512, 3484, 1, 2048] + - [32, 66.3] + - - [512, 3487, 1, 2048] + - [51, 66.367] + - - [512, 3489, 1, 2048] + - [15, 66.413] + - - [512, 3490, 1, 2048] + - [51, 66.367] + - - [512, 3491, 1, 2048] + - [15, 66.498] + - - [512, 3493, 1, 2048] + - [51, 66.58] + - - [512, 3494, 1, 2048] + - [51, 66.476] + - - [512, 3497, 1, 2048] + - [15, 66.62] + - - [512, 3498, 1, 2048] + - [51, 66.534] + - - [512, 3499, 1, 2048] + - [51, 66.634] + - - [512, 3501, 1, 2048] + - [51, 66.634] + - - [512, 3503, 1, 2048] + - [51, 66.616] + - - [512, 3508, 1, 2048] + - [15, 66.764] + - - [512, 3509, 1, 2048] + - [15, 66.67] + - - [512, 3511, 1, 2048] + - [51, 66.755] + - - [512, 3514, 1, 2048] + - [15, 66.958] + - - [512, 3518, 1, 2048] + - [15, 67.044] + - - [512, 3519, 1, 2048] + - [15, 67.031] + - - [512, 3520, 1, 2048] + - [15, 67.098] + - - [512, 3523, 1, 2048] + - [15, 67.107] + - - [512, 3528, 1, 2048] + - [15, 67.035] + - - [512, 3529, 1, 2048] + - [15, 67.044] + - - [512, 3530, 1, 2048] + - [15, 67.18] + - - [512, 3532, 1, 2048] + - [51, 67.378] + - - [512, 3533, 1, 2048] + - [15, 67.279] + - - [512, 3534, 1, 2048] + - [15, 67.265] + - - [512, 3538, 1, 2048] + - [15, 67.378] + - - [512, 3539, 1, 2048] + - [32, 67.365] + - - [512, 3541, 1, 2048] + - [15, 67.319] + - - [512, 3547, 1, 2048] + - [15, 67.477] + - - [512, 3548, 1, 2048] + - [15, 67.423] + - - [512, 3552, 1, 2048] + - [15, 67.577] + - - [512, 3564, 1, 2048] + - [15, 67.789] + - - [512, 3575, 1, 2048] + - [15, 67.847] + - - [512, 3598, 1, 2048] + - [51, 68.402] + - - [512, 3599, 1, 2048] + - [32, 68.483] + - - [512, 3608, 1, 2048] + - [51, 68.623] + - - [512, 3780, 1, 512] + - [15, 66.173] + - - [512, 3780, 1, 2048] + - [32, 71.985] + - - [512, 3796, 1, 512] + - [15, 66.025] + - - [512, 3796, 1, 2048] + - [51, 72.048] + - - [512, 3822, 1, 512] + - [15, 66.552] + - - [512, 3822, 1, 2048] + - [15, 72.652] + - - [512, 3840, 1, 512] + - [32, 70.067] + - - [512, 3840, 1, 2048] + - [32, 73.78] + - - [512, 3859, 1, 512] + - [11, 68.019] + - - [512, 3859, 1, 2048] + - [15, 73.207] + - - [512, 3870, 1, 512] + - [11, 67.969] + - - [512, 3870, 1, 2048] + - [51, 73.491] + - - [512, 3876, 1, 512] + - [11, 68.321] + - - [512, 3876, 1, 2048] + - [32, 73.595] + - - [512, 3906, 1, 512] + - [28, 68.438] + - - [512, 3906, 1, 2048] + - [15, 74.064] + - - [512, 3910, 1, 512] + - [15, 68.249] + - - [512, 3910, 1, 2048] + - [15, 74.132] + - - [512, 3925, 1, 512] + - [11, 68.65] + - - [512, 3925, 1, 2048] + - [15, 74.398] + - - [512, 3927, 1, 512] + - [51, 68.705] + - - [512, 3942, 1, 512] + - [11, 68.542] + - - [512, 3942, 1, 2048] + - [32, 74.782] + - - [512, 3944, 1, 512] + - [11, 68.962] + - - [512, 3944, 1, 2048] + - [15, 74.809] + - - [512, 3955, 1, 512] + - [51, 68.79] + - - [512, 3955, 1, 2048] + - [32, 75.089] + - - [512, 3968, 1, 512] + - [26, 70.193] + - - [512, 3968, 1, 2048] + - [51, 75.283] + - - [512, 3969, 1, 512] + - [11, 69.359] + - - [512, 3969, 1, 2048] + - [15, 75.197] + - - [512, 3976, 1, 512] + - [11, 69.521] + - - [512, 3976, 1, 2048] + - [15, 75.332] + - - [512, 3977, 1, 512] + - [11, 69.643] + - - [512, 3977, 1, 2048] + - [15, 75.395] + - - [512, 3978, 1, 512] + - [15, 69.228] + - - [512, 3978, 1, 2048] + - [15, 75.337] + - - [512, 3990, 1, 512] + - [51, 69.417] + - - [512, 3990, 1, 2048] + - [15, 75.68] + - - [512, 3995, 1, 512] + - [32, 69.282] + - - [512, 3995, 1, 2048] + - [15, 75.792] + - - [512, 3996, 1, 512] + - [51, 69.462] + - - [512, 3996, 1, 2048] + - [51, 75.756] + - - [512, 3999, 1, 512] + - [11, 70.175] + - - [512, 3999, 1, 2048] + - [15, 75.68] + - - [512, 4005, 1, 512] + - [51, 69.72] + - - [512, 4005, 1, 2048] + - [15, 75.779] + - - [512, 4012, 1, 512] + - [51, 69.656] + - - [512, 4012, 1, 2048] + - [15, 76.05] + - - [512, 4020, 1, 512] + - [26, 69.918] + - - [512, 4020, 1, 2048] + - [51, 76.126] + - - [512, 4026, 1, 512] + - [11, 70.148] + - - [512, 4026, 1, 2048] + - [15, 76.302] + - - [512, 4030, 1, 512] + - [11, 70.216] + - - [512, 4030, 1, 2048] + - [15, 76.447] + - - [512, 4032, 1, 512] + - [28, 70.026] + - - [512, 4032, 1, 2048] + - [15, 76.379] + - - [512, 4050, 1, 512] + - [51, 70.505] + - - [512, 4059, 1, 512] + - [15, 70.41] + - - [2048, 2790, 1, 512] + - [13, 80.818] + - - [2048, 3092, 1, 512] + - [15, 76.379] + - - [2048, 3113, 1, 512] + - [30, 77.178] + - - [2048, 3137, 1, 512] + - [13, 77.76] + - - [2048, 3165, 1, 512] + - [30, 78.206] + - - [2048, 3166, 1, 512] + - [30, 78.436] + - - [2048, 3194, 1, 512] + - [30, 79.005] + - - [2048, 3219, 1, 512] + - [30, 78.847] + - - [2048, 3222, 1, 512] + - [30, 79.014] + - - [2048, 3234, 1, 512] + - [15, 79.248] + - - [2048, 3237, 1, 512] + - [30, 79.659] + - - [2048, 3242, 1, 512] + - [30, 79.749] + - - [2048, 3246, 1, 512] + - [30, 79.51] + - - [2048, 3249, 1, 512] + - [30, 80.038] + - - [2048, 3251, 1, 512] + - [30, 80.006] + - - [2048, 3257, 1, 512] + - [15, 79.93] + - - [2048, 3262, 1, 512] + - [30, 80.061] + - - [2048, 3268, 1, 512] + - [30, 80.313] + - - [2048, 3282, 1, 512] + - [30, 80.467] + - - [2048, 3286, 1, 512] + - [30, 80.724] + - - [2048, 3293, 1, 512] + - [30, 80.927] + - - [2048, 3297, 1, 512] + - [30, 81.003] + - - [2048, 3307, 1, 512] + - [30, 81.071] + - - [2048, 3314, 1, 512] + - [13, 80.832] + - - [2048, 3315, 1, 512] + - [30, 81.486] + - - [2048, 3319, 1, 512] + - [30, 81.256] + - - [2048, 3322, 1, 512] + - [30, 81.319] + - - [2048, 3323, 1, 512] + - [30, 81.134] + - - [2048, 3324, 1, 512] + - [30, 81.64] + - - [2048, 3325, 1, 512] + - [30, 81.387] + - - [2048, 3327, 1, 512] + - [13, 81.333] + - - [2048, 3329, 1, 512] + - [13, 81.148] + - - [2048, 3332, 1, 512] + - [13, 81.387] + - - [2048, 3336, 1, 512] + - [30, 81.73] + - - [2048, 3339, 1, 512] + - [13, 81.225] + - - [2048, 3342, 1, 512] + - [30, 81.558] + - - [2048, 3344, 1, 512] + - [13, 81.667] + - - [2048, 3358, 1, 512] + - [30, 82.195] + - - [2048, 3360, 1, 512] + - [30, 82.294] + - - [2048, 3364, 1, 512] + - [13, 82.154] + - - [2048, 3365, 1, 512] + - [30, 82.1] + - - [2048, 3369, 1, 512] + - [30, 82.393] + - - [2048, 3371, 1, 512] + - [30, 82.443] + - - [2048, 3374, 1, 512] + - [13, 82.348] + - - [2048, 3376, 1, 512] + - [30, 82.695] + - - [2048, 3377, 1, 512] + - [30, 82.515] + - - [2048, 3378, 1, 512] + - [30, 82.777] + - - [2048, 3381, 1, 512] + - [30, 82.655] + - - [2048, 3382, 1, 512] + - [30, 82.745] + - - [2048, 3383, 1, 512] + - [30, 82.677] + - - [2048, 3384, 1, 512] + - [30, 83.106] + - - [2048, 3385, 1, 512] + - [30, 82.853] + - - [2048, 3386, 1, 512] + - [13, 82.501] + - - [2048, 3388, 1, 512] + - [30, 83.101] + - - [2048, 3390, 1, 512] + - [30, 82.975] + - - [2048, 3391, 1, 512] + - [30, 82.957] + - - [2048, 3396, 1, 512] + - [30, 83.192] + - - [2048, 3399, 1, 512] + - [13, 82.898] + - - [2048, 3402, 1, 512] + - [30, 83.435] + - - [2048, 3410, 1, 512] + - [13, 83.25] + - - [2048, 3414, 1, 512] + - [30, 83.868] + - - [2048, 3415, 1, 512] + - [30, 83.769] + - - [2048, 3418, 1, 512] + - [13, 83.566] + - - [2048, 3420, 1, 512] + - [30, 83.674] + - - [2048, 3422, 1, 512] + - [30, 83.796] + - - [2048, 3425, 1, 512] + - [30, 83.711] + - - [2048, 3426, 1, 512] + - [30, 83.652] + - - [2048, 3427, 1, 512] + - [13, 83.58] + - - [2048, 3428, 1, 512] + - [30, 83.81] + - - [2048, 3430, 1, 512] + - [13, 83.539] + - - [2048, 3431, 1, 512] + - [30, 84.058] + - - [2048, 3432, 1, 512] + - [13, 83.674] + - - [2048, 3438, 1, 512] + - [30, 83.692] + - - [2048, 3439, 1, 512] + - [13, 83.882] + - - [2048, 3440, 1, 512] + - [13, 83.647] + - - [2048, 3443, 1, 512] + - [13, 83.738] + - - [2048, 3445, 1, 512] + - [30, 83.742] + - - [2048, 3447, 1, 512] + - [30, 84.302] + - - [2048, 3448, 1, 512] + - [30, 84.089] + - - [2048, 3450, 1, 512] + - [30, 83.864] + - - [2048, 3451, 1, 512] + - [30, 84.184] + - - [2048, 3452, 1, 512] + - [30, 84.284] + - - [2048, 3453, 1, 512] + - [30, 84.094] + - - [2048, 3455, 1, 512] + - [30, 84.514] + - - [2048, 3457, 1, 512] + - [30, 78.991] + - - [2048, 3458, 1, 512] + - [30, 79.077] + - - [2048, 3459, 1, 512] + - [30, 79.077] + - - [2048, 3460, 1, 512] + - [30, 78.689] + - - [2048, 3461, 1, 512] + - [30, 79.185] + - - [2048, 3462, 1, 512] + - [13, 78.657] + - - [2048, 3467, 1, 512] + - [30, 79.244] + - - [2048, 3468, 1, 512] + - [30, 79.406] + - - [2048, 3470, 1, 512] + - [30, 79.537] + - - [2048, 3471, 1, 512] + - [30, 79.054] + - - [2048, 3472, 1, 512] + - [30, 79.384] + - - [2048, 3475, 1, 512] + - [30, 79.312] + - - [2048, 3477, 1, 512] + - [30, 79.506] + - - [2048, 3478, 1, 512] + - [30, 80.083] + - - [2048, 3479, 1, 512] + - [13, 79.077] + - - [2048, 3480, 1, 512] + - [30, 79.605] + - - [2048, 3481, 1, 512] + - [30, 79.573] + - - [2048, 3483, 1, 512] + - [30, 79.366] + - - [2048, 3484, 1, 512] + - [30, 79.695] + - - [2048, 3487, 1, 512] + - [30, 79.6] + - - [2048, 3489, 1, 512] + - [30, 79.379] + - - [2048, 3490, 1, 512] + - [30, 79.916] + - - [2048, 3491, 1, 512] + - [30, 79.433] + - - [2048, 3493, 1, 512] + - [30, 80.133] + - - [2048, 3494, 1, 512] + - [30, 79.993] + - - [2048, 3495, 1, 512] + - [30, 80.006] + - - [2048, 3497, 1, 512] + - [30, 80.083] + - - [2048, 3498, 1, 512] + - [30, 80.015] + - - [2048, 3499, 1, 512] + - [30, 79.83] + - - [2048, 3501, 1, 512] + - [13, 79.636] + - - [2048, 3503, 1, 512] + - [30, 80.097] + - - [2048, 3507, 1, 512] + - [30, 80.029] + - - [2048, 3508, 1, 512] + - [30, 80.498] + - - [2048, 3509, 1, 512] + - [30, 80.615] + - - [2048, 3511, 1, 512] + - [30, 80.498] + - - [2048, 3514, 1, 512] + - [30, 80.164] + - - [2048, 3515, 1, 512] + - [30, 80.467] + - - [2048, 3517, 1, 512] + - [30, 80.403] + - - [2048, 3518, 1, 512] + - [30, 80.512] + - - [2048, 3519, 1, 512] + - [30, 80.525] + - - [2048, 3520, 1, 512] + - [30, 80.471] + - - [2048, 3523, 1, 512] + - [30, 80.733] + - - [2048, 3528, 1, 512] + - [30, 80.593] + - - [2048, 3529, 1, 512] + - [30, 80.593] + - - [2048, 3530, 1, 512] + - [30, 80.787] + - - [2048, 3532, 1, 512] + - [30, 80.692] + - - [2048, 3533, 1, 512] + - [30, 80.823] + - - [2048, 3534, 1, 512] + - [30, 80.34] + - - [2048, 3538, 1, 512] + - [30, 80.688] + - - [2048, 3539, 1, 512] + - [30, 80.665] + - - [2048, 3541, 1, 512] + - [30, 80.647] + - - [2048, 3547, 1, 512] + - [30, 81.184] + - - [2048, 3548, 1, 512] + - [30, 80.597] + - - [2048, 3552, 1, 512] + - [30, 81.085] + - - [2048, 3564, 1, 512] + - [30, 81.477] + - - [2048, 3575, 1, 512] + - [30, 81.734] + - - [2048, 3598, 1, 512] + - [30, 81.419] + - - [2048, 3599, 1, 512] + - [13, 81.495] + - - [2048, 3608, 1, 512] + - [13, 81.649] + - - [2048, 3780, 1, 512] + - [30, 80.399] + - - [2048, 3796, 1, 512] + - [13, 80.746] + - - [2048, 3822, 1, 512] + - [30, 80.918] + - - [2048, 3840, 1, 512] + - [15, 82.628] + - - [2048, 3859, 1, 512] + - [30, 81.883] + - - [2048, 3870, 1, 512] + - [30, 82.005] + - - [2048, 3876, 1, 512] + - [30, 82.05] + - - [2048, 3906, 1, 512] + - [30, 82.916] + - - [2048, 3910, 1, 512] + - [13, 82.515] + - - [2048, 3925, 1, 512] + - [13, 82.844] + - - [2048, 3942, 1, 512] + - [13, 83.106] + - - [2048, 3944, 1, 512] + - [30, 83.53] + - - [2048, 3955, 1, 512] + - [30, 83.747] + - - [2048, 3968, 1, 512] + - [30, 84.468] + - - [2048, 3969, 1, 512] + - [30, 79.086] + - - [2048, 3976, 1, 512] + - [30, 79.456] + - - [2048, 3977, 1, 512] + - [30, 79.546] + - - [2048, 3978, 1, 512] + - [30, 79.447] + - - [2048, 3990, 1, 512] + - [30, 80.079] + - - [2048, 3995, 1, 512] + - [30, 79.876] + - - [2048, 3996, 1, 512] + - [30, 80.43] + - - [2048, 4005, 1, 512] + - [30, 80.061] + - - [2048, 4012, 1, 512] + - [30, 80.182] + - - [2048, 4020, 1, 512] + - [30, 80.259] + - - [2048, 4026, 1, 512] + - [30, 80.638] + - - [2048, 4030, 1, 512] + - [30, 80.471] + - - [2048, 4032, 1, 512] + - [30, 80.494] + - - [33708, 184, 1, 512] + - [48, 68.596] + - - [33708, 208, 1, 512] + - [13, 68.073] + - - [33708, 246, 1, 512] + - [49, 80.106] + - - [33708, 264, 1, 512] + - [30, 59.812] + - - [33708, 465, 1, 512] + - [49, 77.489] + - - [33708, 468, 1, 512] + - [49, 77.881] + - - [33708, 493, 1, 512] + - [30, 81.96] + - - [33708, 540, 1, 512] + - [30, 73.568] + - - [33708, 550, 1, 512] + - [30, 74.971] + - - [33708, 560, 1, 512] + - [30, 76.217] + - - [33708, 644, 1, 512] + - [49, 73.586] + - - [33708, 714, 1, 512] + - [49, 81.391] + - - [33708, 720, 1, 512] + - [49, 81.942] + - - [33708, 781, 1, 512] + - [49, 75.865] + - - [33708, 936, 1, 512] + - [49, 80.083] + - - [33708, 980, 1, 512] + - [30, 83.665] + - - [33708, 1232, 1, 512] + - [13, 84.153] + - - [33708, 1290, 1, 512] + - [13, 80.624] + - - [33708, 1350, 1, 512] + - [13, 84.297] + - - [33708, 1424, 1, 512] + - [13, 81.509] + - - [33708, 1458, 1, 512] + - [13, 83.674] + - - [33708, 1462, 1, 512] + - [13, 83.801] + - - [33708, 1520, 1, 512] + - [13, 86.959] + - - [33708, 1596, 1, 512] + - [13, 84.491] + - - [33708, 1599, 1, 512] + - [13, 84.604] + - - [33708, 1615, 1, 512] + - [13, 85.434] + - - [33708, 1680, 1, 512] + - [13, 82.772] + - - [33708, 1917, 1, 512] + - [13, 88.371] + - - [33708, 2205, 1, 512] + - [13, 84.897] + - - [33708, 2418, 1, 512] + - [13, 88.24] + - - [33708, 3776, 1, 512] + - [13, 87.771] + - - [33708, 3780, 1, 512] + - [13, 87.843] + - - [33708, 3796, 1, 512] + - [13, 88.24] + - - [33708, 3822, 1, 512] + - [13, 88.723] + - - [33708, 3835, 1, 512] + - [13, 89.102] + - - [33708, 3840, 1, 512] + - [13, 89.17] + - - [33708, 3859, 1, 512] + - [13, 86.756] + - - [33708, 3870, 1, 512] + - [13, 86.945] + - - [33708, 3876, 1, 512] + - [13, 87.085] + - - [33708, 3906, 1, 512] + - [13, 87.649] + - - [33708, 3910, 1, 512] + - [13, 87.852] + - - [33708, 3925, 1, 512] + - [13, 88.209] + - - [33708, 3942, 1, 512] + - [13, 88.556] + - - [33708, 3944, 1, 512] + - [13, 88.583] + - - [33708, 3955, 1, 512] + - [13, 88.885] + - - [33708, 3968, 1, 512] + - [30, 89.125] + - - [33708, 3976, 1, 512] + - [13, 86.652] + - - [33708, 3977, 1, 512] + - [13, 86.584] + - - [33708, 3978, 1, 512] + - [13, 86.436] + - - [33708, 3990, 1, 512] + - [13, 86.873] + - - [33708, 3996, 1, 512] + - [13, 86.991] + - - [33708, 3999, 1, 512] + - [13, 87.09] + - - [33708, 4005, 1, 512] + - [13, 87.293] + - - [33708, 4012, 1, 512] + - [13, 87.428] + - - [33708, 4020, 1, 512] + - [13, 87.586] + - - [33708, 4026, 1, 512] + - [13, 87.708] + - - [33708, 4030, 1, 512] + - [13, 87.83] + - - [33708, 4032, 1, 512] + - [13, 87.83] + - - [3072, 512, 1, 3072] + - [13, 71.795] + - - [511, 8192, 1, 8192] + - [51, 73.067] + - - [4096, 4096, 1, 4096] + - [13, 86.788] + - - [8192, 8193, 1, 8192] + - [51, 81.076] + - - [3072, 3072, 1, 3071] + - [24, 91.295] + - - [8192, 8192, 1, 8193] + - [7, 92.188] + - - [7681, 8192, 1, 8192] + - [51, 87.6] + - - [7680, 8192, 1, 8193] + - [24, 91.281] + - - [513, 4096, 1, 4096] + - [13, 59.212] + - - [3073, 512, 1, 3072] + - [13, 71.321] + - - [7680, 8192, 1, 8192] + - [51, 89.165] + - - [4096, 4096, 1, 4097] + - [41, 87.73] + - - [8192, 8191, 1, 8192] + - [51, 82.907] + - - [8192, 512, 1, 8193] + - [43, 77.557] + - - [2880, 3071, 1, 3072] + - [51, 85.041] + - - [2880, 3072, 1, 3072] + - [15, 85.19] + - - [4096, 511, 1, 4096] + - [15, 77.367] + - - [512, 3072, 1, 3072] + - [13, 72.336] + - - [512, 8191, 1, 8192] + - [32, 71.962] + - - [4096, 4095, 1, 4096] + - [13, 87.221] + - - [8192, 511, 1, 8192] + - [51, 68.335] + - - [8192, 512, 1, 8192] + - [51, 71.412] + - - [511, 3072, 1, 3072] + - [49, 71.082] + - - [7680, 8193, 1, 8192] + - [51, 86.436] + - - [2048, 2048, 1, 2048] + - [15, 79.357] + - - [3072, 512, 1, 3073] + - [37, 73.744] + - - [513, 8192, 1, 8192] + - [32, 62.875] + - - [7679, 8192, 1, 8192] + - [51, 89.486] + - - [3840, 4096, 1, 4097] + - [22, 88.24] + - - [512, 3072, 1, 3071] + - [37, 74.637] + - - [7680, 8192, 1, 8191] + - [24, 91.295] + - - [3072, 511, 1, 3072] + - [30, 70.888] + - - [8193, 8192, 1, 8192] + - [51, 85.254] + - - [512, 4096, 1, 4095] + - [43, 79.14] + - - [512, 3071, 1, 3072] + - [49, 71.479] + - - [3073, 3072, 1, 3072] + - [13, 83.259] + - - [512, 3073, 1, 3072] + - [49, 71.366] + - - [4096, 4096, 1, 4095] + - [22, 87.911] + - - [1920, 2048, 1, 2047] + - [37, 84.468] + - - [1920, 2049, 1, 2048] + - [13, 72.021] + - - [512, 8192, 1, 8191] + - [7, 81.112] + - - [3840, 4096, 1, 4096] + - [15, 86.887] + - - [8191, 512, 1, 8192] + - [32, 68.366] + - - [2881, 3072, 1, 3072] + - [32, 85.172] + - - [512, 4096, 1, 4096] + - [15, 78.928] + - - [3841, 4096, 1, 4096] + - [15, 85.786] + - - [2880, 3072, 1, 3073] + - [7, 85.168] + - - [4095, 512, 1, 4096] + - [15, 78.188] + - - [1919, 2048, 1, 2048] + - [13, 81.297] + - - [1920, 2048, 1, 2048] + - [13, 81.671] + - - [8192, 8192, 1, 8192] + - [51, 82.592] + - - [511, 4096, 1, 4096] + - [51, 78.256] + - - [8192, 513, 1, 8192] + - [44, 56.257] + - - [513, 3072, 1, 3072] + - [51, 58.842] + - - [7680, 8191, 1, 8192] + - [51, 89.292] + - - [512, 4097, 1, 4096] + - [32, 77.972] + - - [2047, 2048, 1, 2048] + - [15, 79.51] + - - [2049, 2048, 1, 2048] + - [13, 76.478] + - - [3840, 4095, 1, 4096] + - [15, 85.642] + - - [2880, 3072, 1, 3071] + - [24, 85.272] + - - [3072, 3072, 1, 3073] + - [7, 91.295] + - - [2880, 3073, 1, 3072] + - [13, 82.605] + - - [4096, 513, 1, 4096] + - [13, 60.331] + - - [4097, 512, 1, 4096] + - [15, 78.603] + - - [8192, 512, 1, 8191] + - [24, 77.566] + - - [1921, 2048, 1, 2048] + - [15, 74.38] + - - [512, 3072, 1, 3073] + - [37, 74.854] + - - [2048, 2049, 1, 2048] + - [15, 79.285] + - - [3072, 512, 1, 3071] + - [18, 74.89] + - - [3071, 3072, 1, 3072] + - [32, 91.101] + - - [3840, 4097, 1, 4096] + - [15, 81.595] + - - [2048, 2047, 1, 2048] + - [32, 78.617] + - - [2879, 3072, 1, 3072] + - [32, 85.231] + - - [3072, 513, 1, 3072] + - [51, 59.198] + - - [512, 4095, 1, 4096] + - [51, 78.342] + - - [3071, 512, 1, 3072] + - [13, 71.588] + - - [4096, 512, 1, 4096] + - [15, 79.127] + - - [4097, 4096, 1, 4096] + - [15, 84.5] + - - [2048, 2048, 1, 2047] + - [24, 79.858] + - - [3839, 4096, 1, 4096] + - [15, 86.129] + - - [512, 4096, 1, 4097] + - [43, 79.081] + - - [3072, 3073, 1, 3072] + - [13, 83.201] + - - [2048, 2048, 1, 2049] + - [24, 80.142] + - - [8191, 8192, 1, 8192] + - [51, 82.42] + - - [3072, 3071, 1, 3072] + - [32, 90.78] + - - [4096, 512, 1, 4097] + - [43, 78.964] + - - [3840, 4096, 1, 4095] + - [5, 88.249] + - - [1920, 2047, 1, 2048] + - [13, 80.43] + - - [8192, 8192, 1, 8191] + - [7, 92.202] + - - [3072, 3072, 1, 3072] + - [32, 91.155] + - - [512, 8193, 1, 8192] + - [32, 73.149] + - - [4096, 512, 1, 4095] + - [43, 78.915] + - - [8193, 512, 1, 8192] + - [32, 68.659] + - - [4095, 4096, 1, 4096] + - [13, 86.864] + - - [4096, 4097, 1, 4096] + - [13, 84.487] + - - [512, 8192, 1, 8192] + - [32, 74.511] + - - [512, 8192, 1, 8193] + - [24, 81.13] + - - [1920, 2048, 1, 2049] + - [37, 84.103] + - - [479, 3072, 1, 3072] + - [13, 67.315] + - - [479, 4096, 1, 4096] + - [32, 73.825] + - - [479, 8192, 1, 8192] + - [32, 68.538] + - - [480, 3072, 1, 3071] + - [37, 70.135] + - - [480, 3072, 1, 3073] + - [1, 70.13] + - - [480, 3073, 1, 3072] + - [13, 67.468] + - - [480, 4095, 1, 4096] + - [32, 73.839] + - - [480, 4096, 1, 4095] + - [24, 74.461] + - - [480, 4096, 1, 4097] + - [43, 74.416] + - - [480, 4097, 1, 4096] + - [32, 73.965] + - - [480, 8191, 1, 8192] + - [32, 68.488] + - - [480, 8192, 1, 8191] + - [43, 76.054] + - - [480, 8192, 1, 8193] + - [7, 76.045] + - - [480, 8193, 1, 8192] + - [51, 68.813] + - - [481, 3072, 1, 3072] + - [13, 67.008] + - - [481, 4096, 1, 4096] + - [51, 74.096] + - - [481, 8192, 1, 8192] + - [51, 70.708] + - - [3072, 479, 1, 3072] + - [30, 66.778] + - - [3072, 480, 1, 3071] + - [37, 70.076] + - - [3072, 480, 1, 3073] + - [18, 69.715] + - - [3072, 481, 1, 3072] + - [30, 67.392] + - - [3073, 480, 1, 3072] + - [49, 67.229] + - - [480, 3072, 1, 3072] + - [30, 67.265] + - - [480, 4096, 1, 4096] + - [51, 74.096] + - - [480, 8192, 1, 8192] + - [32, 69.706] + - - [3072, 480, 1, 3072] + - [49, 66.859] + - - [4096, 480, 1, 4096] + - [15, 72.991] + - - [8192, 480, 1, 8192] + - [51, 65.303] + - - [1024, 3840, 1, 1024] + - [13, 79.844] + - - [1024, 3840, 1, 4096] + - [32, 75.693] + - - [1024, 3968, 1, 1024] + - [30, 83.106] + - - [1024, 3968, 1, 4096] + - [13, 84.27] + - - [1024, 7200, 1, 1024] + - [30, 82.826] + - - [1024, 7200, 1, 4096] + - [13, 82.492] + - - [1024, 8160, 1, 1024] + - [49, 82.131] + - - [1024, 8160, 1, 4096] + - [15, 78.725] + - - [1024, 9520, 1, 1024] + - [30, 84.455] + - - [1024, 9520, 1, 4096] + - [15, 82.768] + - - [1024, 10200, 1, 1024] + - [32, 87.117] + - - [1024, 10200, 1, 4096] + - [15, 87.121] + - - [4096, 3840, 1, 1024] + - [13, 87.009] + - - [4096, 3968, 1, 1024] + - [13, 87.103] + - - [4096, 7200, 1, 1024] + - [13, 86.224] + - - [4096, 8160, 1, 1024] + - [13, 87.081] + - - [4096, 9520, 1, 1024] + - [13, 86.246] + - - [4096, 10200, 1, 1024] + - [13, 86.815] + - - [42720, 3968, 1, 1024] + - [49, 89.819] + - - [42720, 7200, 1, 1024] + - [51, 88.714] + - - [42720, 9520, 1, 1024] + - [51, 89.58] + - - [2048, 960, 1, 2048] + - [51, 72.034] + - - [2048, 960, 1, 74] + - [0, 57.985] + - - [1600, 1024, 1, 960] + - [18, 75.156] + - - [2048, 2048, 1, 960] + - [24, 79.33] + - - [4096, 1024, 1, 257] + - [18, 73.464] + - - [10240, 8976, 1, 256] + - [13, 86.612] + - - [1024, 1600, 1, 1024] + - [49, 69.472] + - - [1024, 1600, 1, 560] + - [38, 72.855] + - - [10496, 8976, 1, 256] + - [13, 87.108] + - - [11264, 8976, 1, 256] + - [13, 86.598] + - - [11776, 8976, 1, 256] + - [13, 87.248] + - - [12544, 8976, 1, 256] + - [13, 87.284] + - - [1280, 8976, 1, 256] + - [49, 80.661] + - - [13312, 8976, 1, 256] + - [13, 87.225] + - - [13568, 8976, 1, 256] + - [13, 87.487] + - - [13824, 8976, 1, 256] + - [13, 87.293] + - - [15104, 8976, 1, 256] + - [13, 87.518] + - - [15360, 8976, 1, 256] + - [13, 87.234] + - - [15872, 8976, 1, 256] + - [13, 87.451] + - - [16128, 8976, 1, 256] + - [13, 87.649] + - - [17152, 8976, 1, 256] + - [13, 87.735] + - - [1792, 8976, 1, 256] + - [49, 82.402] + - - [18176, 8976, 1, 256] + - [13, 87.739] + - - [18688, 8976, 1, 256] + - [13, 87.789] + - - [18944, 8976, 1, 256] + - [13, 87.717] + - - [19712, 8976, 1, 256] + - [13, 87.712] + - - [19968, 8976, 1, 256] + - [13, 87.69] + - - [20480, 8976, 1, 256] + - [13, 87.487] + - - [2048, 1536, 1, 512] + - [15, 72.697] + - - [2048, 1536, 1, 768] + - [32, 76.911] + - - [2048, 684, 1, 512] + - [29, 62.208] + - - [2048, 684, 1, 768] + - [29, 63.665] + - - [2048, 8976, 1, 256] + - [30, 84.089] + - - [20992, 8976, 1, 256] + - [13, 87.735] + - - [21248, 8976, 1, 256] + - [13, 87.73] + - - [2304, 8976, 1, 256] + - [24, 83.706] + - - [23552, 8976, 1, 256] + - [13, 87.609] + - - [2560, 8976, 1, 256] + - [30, 84.414] + - - [256, 10496, 1, 1024] + - [49, 75.071] + - - [256, 11264, 1, 1024] + - [49, 81.459] + - - [256, 11520, 1, 1024] + - [14, 77.178] + - - [256, 11776, 1, 1024] + - [51, 74.809] + - - [256, 12544, 1, 1024] + - [51, 79.388] + - - [256, 13312, 1, 1024] + - [32, 83.738] + - - [256, 14336, 1, 1024] + - [30, 76.0] + - - [256, 14592, 1, 1024] + - [30, 77.421] + - - [256, 14848, 1, 1024] + - [30, 78.693] + - - [256, 15104, 1, 1024] + - [49, 80.015] + - - [256, 16128, 1, 1024] + - [14, 77.773] + - - [256, 18176, 1, 1024] + - [32, 86.625] + - - [256, 18944, 1, 1024] + - [49, 79.054] + - - [256, 19200, 1, 1024] + - [49, 80.101] + - - [256, 20480, 1, 1024] + - [49, 85.258] + - - [256, 20992, 1, 1024] + - [51, 80.927] + - - [256, 21248, 1, 1024] + - [51, 81.996] + - - [256, 21504, 1, 1024] + - [32, 82.894] + - - [256, 22016, 1, 1024] + - [51, 84.712] + - - [256, 22344, 1, 1024] + - [32, 84.166] + - - [256, 23296, 1, 1024] + - [30, 80.142] + - - [256, 23552, 1, 1024] + - [30, 80.895] + - - [256, 31488, 1, 1024] + - [32, 85.096] + - - [256, 33536, 1, 1024] + - [49, 84.103] + - - [256, 44505, 1, 1024] + - [51, 83.453] + - - [256, 4608, 1, 1024] + - [32, 72.336] + - - [256, 4864, 1, 1024] + - [45, 59.848] + - - [256, 5376, 1, 1024] + - [48, 64.098] + - - [256, 5888, 1, 1024] + - [49, 67.942] + - - [256, 6144, 1, 1024] + - [13, 70.784] + - - [256, 6400, 1, 1024] + - [30, 73.505] + - - [256, 6656, 1, 1024] + - [13, 76.311] + - - [256, 7168, 1, 1024] + - [51, 67.965] + - - [256, 7424, 1, 1024] + - [32, 70.356] + - - [256, 7936, 1, 1024] + - [32, 74.755] + - - [256, 8192, 1, 1024] + - [32, 76.69] + - - [256, 8448, 1, 1024] + - [32, 79.339] + - - [256, 8960, 1, 1024] + - [51, 83.507] + - - [256, 9984, 1, 1024] + - [49, 72.684] + - - [2816, 8976, 1, 256] + - [49, 84.446] + - - [28672, 8976, 1, 256] + - [13, 87.767] + - - [3072, 8976, 1, 256] + - [30, 84.847] + - - [31488, 8976, 1, 256] + - [13, 88.024] + - - [3328, 8976, 1, 256] + - [49, 85.023] + - - [33536, 8976, 1, 256] + - [13, 88.091] + - - [3840, 8976, 1, 256] + - [30, 84.866] + - - [4096, 8976, 1, 256] + - [30, 84.505] + - - [4352, 8976, 1, 256] + - [13, 85.466] + - - [44505, 8976, 1, 256] + - [13, 87.924] + - - [4608, 8976, 1, 256] + - [30, 85.272] + - - [4864, 8976, 1, 256] + - [30, 85.614] + - - [5120, 8976, 1, 256] + - [13, 83.819] + - - [5376, 8976, 1, 256] + - [41, 85.89] + - - [5632, 8976, 1, 256] + - [30, 85.799] + - - [5888, 8976, 1, 256] + - [13, 86.052] + - - [6144, 8976, 1, 256] + - [13, 84.947] + - - [6400, 8976, 1, 256] + - [13, 86.314] + - - [684, 8976, 1, 256] + - [49, 69.548] + - - [7168, 8976, 1, 256] + - [13, 85.727] + - - [7936, 8976, 1, 256] + - [13, 86.548] + - - [8192, 8976, 1, 256] + - [5, 84.568] + - - [8448, 8976, 1, 256] + - [13, 86.941] + - - [8960, 8976, 1, 256] + - [13, 86.982] + - - [9472, 8976, 1, 256] + - [13, 86.991] + - - [9728, 8976, 1, 256] + - [13, 86.932] + - - [9984, 8976, 1, 256] + - [13, 87.036] + - - [512, 32768, 1, 13] + - [36, 29.164] + - - [256, 32768, 1, 512] + - [49, 81.983] + - - [128, 32768, 1, 512] + - [15, 76.699] + - - [1024, 32768, 1, 479] + - [1, 89.03] + - - [1024, 32768, 1, 1024] + - [30, 88.028] + - - [512, 32768, 1, 1024] + - [49, 86.684] + - - [1023, 2048, 1, 4096] + - [32, 78.378] + - - [1025, 2048, 1, 4096] + - [32, 78.878] + - - [1024, 2047, 1, 4096] + - [15, 78.527] + - - [1024, 2049, 1, 4096] + - [15, 78.73] + - - [1024, 2048, 1, 4095] + - [24, 79.497] + - - [1024, 2048, 1, 4097] + - [24, 79.433] + - - [1023, 3072, 1, 1024] + - [32, 76.456] + - - [1025, 3072, 1, 1024] + - [32, 77.092] + - - [1024, 3071, 1, 1024] + - [32, 76.135] + - - [1024, 3073, 1, 1024] + - [15, 76.469] + - - [1024, 3072, 1, 1023] + - [1, 77.868] + - - [1024, 3072, 1, 1025] + - [24, 77.963] + - - [3071, 512, 1, 1024] + - [13, 68.235] + - - [3073, 512, 1, 1024] + - [49, 68.177] + - - [3072, 511, 1, 1024] + - [49, 68.344] + - - [3072, 513, 1, 1024] + - [48, 61.368] + - - [3072, 512, 1, 1023] + - [37, 73.225] + - - [3072, 512, 1, 1025] + - [18, 72.431] + - - [128, 32768, 1, 256] + - [30, 73.18] + - - [1024, 4096, 1, 480] + - [1, 78.305] + - - [512, 4096, 1, 1024] + - [32, 77.191] + - - [512, 55296, 1, 13] + - [39, 30.603] + - - [256, 55296, 1, 512] + - [49, 86.345] + - - [128, 55296, 1, 256] + - [30, 81.879] + - - [1024, 6912, 1, 480] + - [1, 88.633] + - - [1024, 6912, 1, 1024] + - [32, 88.425] + - - [512, 6912, 1, 1024] + - [15, 86.49] + - - [256, 6912, 1, 512] + - [10, 74.317] + - - [1151, 1152, 1, 1152] + - [48, 60.728] + - - [1153, 1152, 1, 1152] + - [46, 60.516] + - - [1152, 1151, 1, 1152] + - [40, 61.183] + - - [1152, 1153, 1, 1152] + - [21, 61.486] + - - [1152, 1152, 1, 1151] + - [17, 68.533] + - - [1152, 1152, 1, 1153] + - [17, 68.804] + - - [1535, 1536, 1, 1536] + - [15, 85.267] + - - [1537, 1536, 1, 1536] + - [49, 66.742] + - - [1536, 1535, 1, 1536] + - [51, 84.216] + - - [1536, 1537, 1, 1536] + - [49, 66.904] + - - [1536, 1536, 1, 1535] + - [24, 86.918] + - - [1536, 1536, 1, 1537] + - [7, 86.729] + - - [1919, 1920, 1, 1920] + - [5, 76.726] + - - [1921, 1920, 1, 1920] + - [49, 76.69] + - - [1920, 1919, 1, 1920] + - [41, 76.753] + - - [1920, 1921, 1, 1920] + - [41, 76.713] + - - [1920, 1920, 1, 1919] + - [37, 79.749] + - - [1920, 1920, 1, 1921] + - [18, 79.492] + - - [2303, 2304, 1, 2304] + - [32, 81.148] + - - [2305, 2304, 1, 2304] + - [51, 81.36] + - - [2304, 2303, 1, 2304] + - [32, 80.823] + - - [2304, 2305, 1, 2304] + - [51, 81.301] + - - [2304, 2304, 1, 2303] + - [19, 83.963] + - - [2304, 2304, 1, 2305] + - [38, 83.828] + - - [2687, 2688, 1, 2688] + - [22, 83.11] + - - [2689, 2688, 1, 2688] + - [41, 83.101] + - - [2688, 2687, 1, 2688] + - [41, 83.147] + - - [2688, 2689, 1, 2688] + - [41, 83.106] + - - [2688, 2688, 1, 2687] + - [37, 85.998] + - - [2688, 2688, 1, 2689] + - [37, 85.772] + - - [3455, 3456, 1, 3456] + - [41, 86.142] + - - [3457, 3456, 1, 3456] + - [24, 84.699] + - - [3456, 3455, 1, 3456] + - [22, 86.219] + - - [3456, 3457, 1, 3456] + - [24, 84.117] + - - [3456, 3456, 1, 3455] + - [37, 88.971] + - - [3456, 3456, 1, 3457] + - [18, 88.858] + - - [3839, 3840, 1, 3840] + - [15, 87.23] + - - [3841, 3840, 1, 3840] + - [7, 87.252] + - - [3840, 3839, 1, 3840] + - [32, 87.252] + - - [3840, 3841, 1, 3840] + - [41, 85.763] + - - [3840, 3840, 1, 3839] + - [3, 87.415] + - - [3840, 3840, 1, 3841] + - [24, 87.261] + - - [4223, 4224, 1, 4224] + - [22, 87.622] + - - [4225, 4224, 1, 4224] + - [22, 85.019] + - - [4224, 4223, 1, 4224] + - [5, 87.649] + - - [4224, 4225, 1, 4224] + - [24, 87.437] + - - [4224, 4224, 1, 4223] + - [18, 90.397] + - - [4224, 4224, 1, 4225] + - [18, 90.338] + - - [4607, 4608, 1, 4608] + - [32, 91.953] + - - [4609, 4608, 1, 4608] + - [32, 87.46] + - - [4608, 4607, 1, 4608] + - [51, 91.935] + - - [4608, 4609, 1, 4608] + - [32, 87.315] + - - [4608, 4608, 1, 4607] + - [24, 92.332] + - - [4608, 4608, 1, 4609] + - [24, 92.319] + - - [4991, 4992, 1, 4992] + - [24, 88.813] + - - [4993, 4992, 1, 4992] + - [22, 86.485] + - - [4992, 4991, 1, 4992] + - [24, 88.831] + - - [4992, 4993, 1, 4992] + - [7, 88.908] + - - [4992, 4992, 1, 4991] + - [18, 91.218] + - - [4992, 4992, 1, 4993] + - [18, 91.096] + - - [5375, 5376, 1, 5376] + - [7, 90.726] + - - [5377, 5376, 1, 5376] + - [24, 87.451] + - - [5376, 5375, 1, 5376] + - [7, 90.613] + - - [5376, 5377, 1, 5376] + - [51, 86.954] + - - [5376, 5376, 1, 5375] + - [24, 90.604] + - - [5376, 5376, 1, 5377] + - [24, 90.595] + - - [5759, 5760, 1, 5760] + - [24, 89.607] + - - [5761, 5760, 1, 5760] + - [30, 87.446] + - - [5760, 5759, 1, 5760] + - [43, 89.562] + - - [5760, 5761, 1, 5760] + - [43, 89.585] + - - [5760, 5760, 1, 5759] + - [37, 90.284] + - - [5760, 5760, 1, 5761] + - [24, 89.413] + - - [6143, 6144, 1, 6144] + - [51, 88.931] + - - [6145, 6144, 1, 6144] + - [51, 87.875] + - - [6144, 6143, 1, 6144] + - [51, 88.615] + - - [6144, 6145, 1, 6144] + - [51, 85.082] + - - [6144, 6144, 1, 6143] + - [24, 92.08] + - - [6144, 6144, 1, 6145] + - [7, 92.066] + - - [6527, 6528, 1, 6528] + - [24, 89.526] + - - [6529, 6528, 1, 6528] + - [22, 88.123] + - - [6528, 6527, 1, 6528] + - [43, 89.467] + - - [6528, 6529, 1, 6528] + - [24, 89.558] + - - [6528, 6528, 1, 6527] + - [24, 89.228] + - - [6528, 6528, 1, 6529] + - [7, 89.219] + - - [6911, 6912, 1, 6912] + - [32, 91.498] + - - [6913, 6912, 1, 6912] + - [24, 89.504] + - - [6912, 6911, 1, 6912] + - [32, 91.57] + - - [6912, 6913, 1, 6912] + - [30, 88.123] + - - [6912, 6912, 1, 6911] + - [24, 91.547] + - - [6912, 6912, 1, 6913] + - [7, 91.561] + - - [7295, 7296, 1, 7296] + - [30, 89.571] + - - [7297, 7296, 1, 7296] + - [24, 88.845] + - - [7296, 7295, 1, 7296] + - [41, 89.621] + - - [7296, 7297, 1, 7296] + - [43, 89.648] + - - [7296, 7296, 1, 7295] + - [37, 90.162] + - - [7296, 7296, 1, 7297] + - [5, 89.49] + - - [7679, 7680, 1, 7680] + - [51, 91.981] + - - [7681, 7680, 1, 7680] + - [51, 89.44] + - - [7680, 7679, 1, 7680] + - [51, 91.845] + - - [7680, 7681, 1, 7680] + - [51, 88.461] + - - [7680, 7680, 1, 7679] + - [7, 92.296] + - - [7680, 7680, 1, 7681] + - [24, 92.319] + - - [1152, 1152, 1, 1152] + - [40, 61.093] + - - [1536, 1536, 1, 1536] + - [32, 86.49] + - - [1920, 1920, 1, 1920] + - [18, 77.331] + - - [2304, 2304, 1, 2304] + - [51, 81.653] + - - [2688, 2688, 1, 2688] + - [22, 83.129] + - - [3456, 3456, 1, 3456] + - [41, 86.16] + - - [3840, 3840, 1, 3840] + - [51, 87.234] + - - [4224, 4224, 1, 4224] + - [41, 87.708] + - - [4608, 4608, 1, 4608] + - [32, 92.152] + - - [4992, 4992, 1, 4992] + - [24, 88.822] + - - [5376, 5376, 1, 5376] + - [7, 90.618] + - - [5760, 5760, 1, 5760] + - [7, 89.576] + - - [6144, 6144, 1, 6144] + - [51, 88.601] + - - [6528, 6528, 1, 6528] + - [43, 89.571] + - - [6912, 6912, 1, 6912] + - [51, 91.57] + - - [7296, 7296, 1, 7296] + - [13, 89.616] + - - [7680, 7680, 1, 7680] + - [51, 91.777] + - - [256, 128, 49, 1152] + - [13, 70.721] + - - [256, 128, 121, 120] + - [2, 68.276] + - - [256, 128, 169, 120] + - [26, 72.526] + - - [256, 128, 36, 120] + - [17, 55.057] + - - [256, 128, 49, 120] + - [17, 58.188] + - - [256, 128, 64, 120] + - [18, 61.91] + - - [256, 128, 36, 12000] + - [19, 73.428] + - - [256, 128, 49, 1216] + - [18, 74.286] + - - [256, 128, 121, 18] + - [8, 25.356] + - - [256, 128, 169, 18] + - [4, 31.275] + - - [256, 128, 36, 18] + - [0, 18.71] + - - [256, 128, 49, 18] + - [0, 21.733] + - - [256, 128, 64, 18] + - [8, 23.921] + - - [256, 128, 36, 1800] + - [47, 79.226] + - - [256, 128, 121, 19] + - [4, 29.15] + - - [256, 128, 169, 19] + - [4, 32.471] + - - [256, 128, 36, 19] + - [8, 19.297] + - - [256, 128, 49, 19] + - [8, 22.712] + - - [256, 128, 64, 19] + - [0, 24.9] + - - [256, 128, 36, 1900] + - [19, 79.361] + - - [256, 128, 49, 480] + - [37, 71.078] + - - [256, 128, 81, 480] + - [22, 73.076] + - - [256, 128, 64, 5880] + - [23, 64.897] + - - [256, 128, 49, 72] + - [36, 46.159] + - - [256, 128, 81, 72] + - [38, 53.288] + - - [256, 128, 49, 76] + - [0, 47.545] + - - [256, 128, 81, 76] + - [36, 49.895] + - - [256, 128, 49, 7680] + - [35, 54.145] + - - [256, 128, 64, 882] + - [18, 74.272] + - - [256, 128, 64, 931] + - [37, 75.314] + - - [256, 256, 49, 1152] + - [15, 78.454] + - - [256, 256, 36, 12000] + - [7, 89.07] + - - [256, 256, 49, 1216] + - [7, 79.023] + - - [256, 256, 36, 1800] + - [7, 85.872] + - - [256, 256, 36, 1900] + - [43, 86.048] + - - [256, 256, 64, 5880] + - [24, 80.552] + - - [256, 256, 49, 7680] + - [34, 64.847] + - - [256, 256, 64, 882] + - [1, 77.308] + - - [256, 256, 64, 931] + - [1, 77.277] + - - [340, 256, 49, 1152] + - [49, 68.461] + - - [340, 256, 36, 120] + - [36, 57.822] + - - [340, 256, 49, 120] + - [38, 62.307] + - - [340, 256, 64, 120] + - [38, 65.339] + - - [340, 256, 36, 12000] + - [24, 79.848] + - - [340, 256, 49, 1216] + - [1, 69.963] + - - [340, 256, 36, 18] + - [36, 19.017] + - - [340, 256, 49, 18] + - [19, 23.659] + - - [340, 256, 64, 18] + - [45, 24.693] + - - [340, 256, 36, 1800] + - [7, 77.313] + - - [340, 256, 36, 19] + - [25, 19.955] + - - [340, 256, 49, 19] + - [25, 24.278] + - - [340, 256, 64, 19] + - [25, 25.87] + - - [340, 256, 36, 1900] + - [7, 77.272] + - - [340, 256, 64, 5880] + - [18, 77.529] + - - [340, 256, 49, 7680] + - [16, 58.905] + - - [340, 256, 64, 882] + - [18, 74.809] + - - [340, 256, 64, 931] + - [37, 74.705] + - - [510, 256, 49, 120] + - [47, 64.549] + - - [510, 256, 64, 120] + - [19, 68.835] + - - [510, 256, 49, 18] + - [38, 19.4] + - - [510, 256, 64, 18] + - [36, 20.303] + - - [510, 256, 49, 19] + - [25, 20.176] + - - [510, 256, 64, 19] + - [17, 21.345] + - - [510, 256, 36, 480] + - [1, 83.72] + - - [510, 256, 36, 72] + - [19, 51.131] + - - [510, 256, 36, 76] + - [50, 50.617] + - - [510, 512, 36, 1080] + - [37, 88.673] + - - [510, 512, 36, 162] + - [38, 73.216] + - - [510, 512, 36, 171] + - [13, 74.674] + - - [510, 512, 49, 1920] + - [51, 88.015] + - - [510, 512, 64, 1920] + - [22, 86.797] + - - [510, 512, 49, 288] + - [22, 82.921] + - - [510, 512, 64, 288] + - [1, 84.554] + - - [510, 512, 36, 3000] + - [24, 90.072] + - - [510, 512, 49, 304] + - [37, 82.925] + - - [510, 512, 64, 304] + - [18, 84.793] + - - [510, 512, 36, 450] + - [22, 83.562] + - - [510, 512, 36, 475] + - [41, 84.432] + - - [510, 512, 49, 480] + - [1, 87.36] + - - [510, 512, 64, 480] + - [1, 85.745] + - - [510, 512, 49, 72] + - [19, 64.062] + - - [510, 512, 64, 72] + - [50, 66.318] + - - [510, 512, 49, 76] + - [23, 67.166] + - - [510, 512, 64, 76] + - [23, 69.02] + - - [512, 256, 81, 1080] + - [37, 88.543] + - - [512, 256, 25, 12000] + - [7, 83.864] + - - [512, 256, 81, 162] + - [1, 79.086] + - - [512, 256, 81, 171] + - [5, 81.698] + - - [512, 256, 25, 1800] + - [7, 81.41] + - - [512, 256, 25, 1900] + - [43, 81.604] + - - [512, 256, 121, 1920] + - [5, 87.365] + - - [512, 256, 169, 1920] + - [32, 89.287] + - - [512, 256, 49, 1920] + - [41, 83.986] + - - [512, 256, 121, 288] + - [22, 79.442] + - - [512, 256, 169, 288] + - [5, 79.415] + - - [512, 256, 49, 288] + - [1, 84.297] + - - [512, 256, 25, 3000] + - [7, 82.474] + - - [512, 256, 81, 3000] + - [7, 91.096] + - - [512, 256, 121, 304] + - [22, 82.001] + - - [512, 256, 169, 304] + - [5, 79.654] + - - [512, 256, 49, 304] + - [1, 84.378] + - - [512, 256, 25, 450] + - [1, 77.611] + - - [512, 256, 81, 450] + - [5, 84.681] + - - [512, 256, 25, 475] + - [37, 76.916] + - - [512, 256, 81, 475] + - [1, 84.342] + - - [512, 256, 121, 480] + - [41, 85.457] + - - [512, 256, 169, 480] + - [41, 86.291] + - - [512, 256, 49, 5880] + - [37, 88.479] + - - [512, 256, 121, 72] + - [1, 75.499] + - - [512, 256, 169, 72] + - [9, 78.229] + - - [512, 256, 121, 76] + - [19, 72.743] + - - [512, 256, 169, 76] + - [13, 74.935] + - - [512, 256, 49, 882] + - [1, 85.66] + - - [512, 256, 49, 931] + - [1, 85.971] + - - [2304, 512, 1, 100] + - [38, 51.375] + - - [2304, 512, 1, 361] + - [38, 73.5] + - - [4608, 510, 1, 100] + - [0, 58.711] + - - [4608, 510, 1, 361] + - [1, 75.228] + - - [8192, 7680, 1, 8192] + - [32, 81.937] + - - [4096, 3840, 1, 4096] + - [15, 84.432] + - - [2048, 1920, 1, 2048] + - [13, 81.211] + - - [30522, 616, 1, 1024] + - [13, 82.619] + - - [128, 128, 128, 64] + - [19, 47.757] + - - [128, 128, 160, 64] + - [6, 50.396] + - - [1024, 1280, 1, 1024] + - [29, 60.895] + - - [1024, 1280, 1, 4096] + - [30, 60.651] + - - [4096, 1280, 1, 1024] + - [13, 84.211] + - - [30522, 200, 1, 1024] + - [15, 64.179] + - - [128, 128, 624, 64] + - [27, 61.197] + - - [1024, 4992, 1, 1024] + - [30, 82.479] + - - [1024, 4992, 1, 4096] + - [32, 78.883] + - - [4096, 4992, 1, 1024] + - [13, 87.622] + - - [30522, 780, 1, 1024] + - [13, 76.28] + - - [30522, 308, 1, 1024] + - [13, 69.494] + - - [128, 128, 640, 64] + - [10, 64.698] + - - [1024, 5120, 1, 1024] + - [30, 84.496] + - - [1024, 5120, 1, 4096] + - [32, 80.918] + - - [4096, 5120, 1, 1024] + - [15, 88.669] + - - [30522, 800, 1, 1024] + - [13, 78.148] + - - [128, 128, 656, 64] + - [4, 61.332] + - - [1024, 5248, 1, 1024] + - [32, 80.169] + - - [1024, 5248, 1, 4096] + - [51, 82.551] + - - [4096, 5248, 1, 1024] + - [13, 87.568] + - - [30522, 820, 1, 1024] + - [13, 80.006] + - - [512, 512, 80, 64] + - [38, 77.994] + - - [1024, 2560, 1, 1024] + - [30, 74.128] + - - [1024, 2560, 1, 4096] + - [13, 72.21] + - - [4096, 2560, 1, 1024] + - [15, 87.947] + - - [30522, 385, 1, 1024] + - [13, 65.082] + - - [30522, 462, 1, 1024] + - [13, 77.732] + - - [128, 128, 144, 64] + - [50, 46.092] + - - [1024, 1152, 1, 1024] + - [13, 73.234] + - - [1024, 1152, 1, 4096] + - [30, 76.848] + - - [4096, 1152, 1, 1024] + - [13, 84.369] + - - [30522, 180, 1, 1024] + - [12, 65.776] + - - [1024, 8192, 1, 1024] + - [30, 82.551] + - - [1024, 8192, 1, 4096] + - [15, 79.023] + - - [1024, 9600, 1, 1024] + - [13, 85.605] + - - [1024, 9600, 1, 4096] + - [15, 82.402] + - - [4096, 8192, 1, 1024] + - [13, 87.794] + - - [4096, 9600, 1, 1024] + - [13, 86.905] + - - [33712, 8192, 1, 1024] + - [51, 90.952] + - - [33712, 9600, 1, 1024] + - [51, 89.63] + - - [1024, 10064, 1, 1024] + - [32, 86.192] + - - [1024, 10064, 1, 4096] + - [32, 86.242] + - - [1024, 10080, 1, 1024] + - [32, 86.314] + - - [1024, 10080, 1, 4096] + - [15, 85.899] + - - [1024, 6528, 1, 1024] + - [32, 82.407] + - - [1024, 6528, 1, 4096] + - [15, 84.527] + - - [1024, 7104, 1, 1024] + - [30, 81.364] + - - [1024, 7104, 1, 4096] + - [15, 80.327] + - - [1024, 8064, 1, 1024] + - [13, 86.395] + - - [1024, 8064, 1, 4096] + - [15, 79.248] + - - [1024, 9216, 1, 1024] + - [32, 88.691] + - - [1024, 9216, 1, 4096] + - [15, 88.538] + - - [4096, 10064, 1, 1024] + - [13, 86.431] + - - [4096, 10080, 1, 1024] + - [13, 86.472] + - - [4096, 6528, 1, 1024] + - [13, 87.297] + - - [4096, 7104, 1, 1024] + - [15, 87.293] + - - [4096, 8064, 1, 1024] + - [13, 87.685] + - - [4096, 9216, 1, 1024] + - [13, 87.252] + - - [42720, 10080, 1, 1024] + - [51, 90.099] + - - [42720, 6528, 1, 1024] + - [49, 89.833] + - - [42720, 7104, 1, 1024] + - [51, 90.0] + - - [1024, 32768, 1, 480] + - [1, 91.277] + - - [30592, 1024, 1, 2048] + - [49, 87.983] + - - [6144, 1024, 1, 2048] + - [13, 84.351] + - - [8192, 1024, 1, 2048] + - [13, 83.557] + - - [30592, 8192, 1, 1024] + - [51, 91.01] + - - [3072, 8192, 1, 1024] + - [30, 88.168] + - - [512, 512, 256, 64] + - [265, 58.96] + - - [30592, 2048, 1, 1024] + - [51, 89.467] + - - [30592, 4096, 1, 1024] + - [51, 90.618] + - - [3072, 4096, 1, 1024] + - [32, 87.203] + - - [1920, 2048, 1, 2560] + - [13, 81.987] + - - [2560, 2048, 1, 2560] + - [30, 85.858] + - - [2560, 2048, 1, 640] + - [41, 84.708] + - - [7680, 2048, 1, 2560] + - [13, 87.943] + - - [512, 512, 40, 64] + - [38, 74.064] + - - [1536, 4096, 1, 1536] + - [30, 84.572] + - - [1536, 4096, 1, 6144] + - [13, 82.795] + - - [4608, 4096, 1, 1536] + - [32, 91.385] + - - [50304, 4096, 1, 1536] + - [32, 91.759] + - - [6144, 4096, 1, 1536] + - [13, 88.276] + - - [1024, 1024, 64, 96] + - [0, 64.206] + - - [1536, 8192, 1, 1536] + - [32, 88.331] + - - [1536, 8192, 1, 6144] + - [32, 88.349] + - - [4608, 8192, 1, 1536] + - [32, 91.155] + - - [50304, 8192, 1, 1536] + - [32, 91.863] + - - [6144, 8192, 1, 1536] + - [15, 89.472] + - - [1024, 1024, 128, 96] + - [0, 62.767] + - - [1024, 16384, 1, 1024] + - [30, 86.851] + - - [1024, 16384, 1, 4096] + - [32, 85.235] + - - [3072, 16384, 1, 1024] + - [30, 88.173] + - - [4096, 16384, 1, 1024] + - [13, 87.78] + - - [50304, 16384, 1, 1024] + - [51, 91.398] + - - [1024, 1024, 256, 64] + - [12, 48.235] + - - [50304, 2048, 1, 1024] + - [51, 90.767] + - - [1024, 1024, 32, 64] + - [0, 72.53] + - - [50304, 4096, 1, 1024] + - [51, 91.236] + - - [1024, 1024, 64, 64] + - [0, 58.81] + - - [50304, 8192, 1, 1024] + - [51, 91.426] + - - [1024, 1024, 128, 64] + - [0, 50.4] + - - [30528, 8192, 1, 1024] + - [51, 90.871] + - - [128, 128, 1024, 64] + - [14, 68.461] + - - [1024, 3456, 1, 1024] + - [30, 82.56] + - - [1024, 3456, 1, 480] + - [1, 85.669] + - - [512, 3456, 1, 1024] + - [14, 76.131] + - - [512, 3456, 1, 13] + - [31, 19.008] + - - [512, 4096, 1, 13] + - [31, 20.637] + - - [512, 6912, 1, 13] + - [36, 23.375] + - - [30528, 640, 1, 1024] + - [13, 86.165] + - - [30528, 1280, 1, 1024] + - [13, 88.38] + - - [30528, 1600, 1, 1024] + - [13, 85.285] + - - [1024, 10240, 1, 1024] + - [32, 88.051] + - - [4096, 10240, 1, 1024] + - [13, 87.555] + - - [1024, 10240, 1, 4096] + - [32, 86.986] + - - [128, 128, 1280, 64] + - [0, 31.541] + - - [1024, 10496, 1, 4096] + - [15, 82.786] + - - [30528, 1640, 1, 1024] + - [13, 87.527] + - - [4096, 10496, 1, 1024] + - [13, 87.203] + - - [1024, 10496, 1, 1024] + - [13, 84.324] + - - [128, 128, 1312, 64] + - [0, 34.086] + - - [30528, 160, 1, 1024] + - [12, 59.32] + - - [30528, 240, 1, 1024] + - [30, 76.803] + - - [1024, 6144, 1, 1024] + - [30, 83.435] + - - [4096, 6144, 1, 1024] + - [13, 87.988] + - - [1024, 6144, 1, 4096] + - [32, 81.256] + - - [512, 512, 192, 64] + - [265, 64.719] + - - [1024, 10224, 1, 1024] + - [32, 87.329] + - - [1024, 10192, 1, 1024] + - [32, 87.6] + - - [1024, 10208, 1, 1024] + - [32, 87.87] + - - [1024, 10224, 1, 4096] + - [32, 87.604] + - - [4096, 10224, 1, 1024] + - [13, 87.13] + - - [3072, 10224, 1, 1024] + - [30, 87.487] + - - [3072, 10240, 1, 1024] + - [30, 87.406] + - - [1024, 10192, 1, 4096] + - [32, 86.914] + - - [4096, 10192, 1, 1024] + - [13, 87.171] + - - [3072, 10192, 1, 1024] + - [30, 86.837] + - - [3072, 10200, 1, 1024] + - [32, 87.23] + - - [1024, 10184, 1, 1024] + - [32, 87.049] + - - [3072, 10208, 1, 1024] + - [30, 87.176] + - - [1024, 10208, 1, 4096] + - [15, 84.396] + - - [4096, 10208, 1, 1024] + - [13, 86.828] + - - [2048, 10224, 1, 1024] + - [15, 88.759] + - - [2048, 10240, 1, 1024] + - [15, 89.355] + - - [1024, 10120, 1, 1024] + - [32, 86.684] + - - [2048, 10192, 1, 1024] + - [15, 88.534] + - - [1024, 10152, 1, 1024] + - [32, 86.842] + - - [3072, 10080, 1, 1024] + - [13, 87.279] + - - [256, 256, 25, 12544] + - [13, 69.138] + - - [256, 256, 49, 3200] + - [32, 77.399] + - - [256, 256, 25, 6272] + - [30, 74.164] + - - [256, 256, 49, 6400] + - [32, 70.27] + - - [512, 512, 49, 1152] + - [24, 87.92] + - - [512, 512, 25, 2048] + - [13, 69.264] + - - [512, 512, 49, 2304] + - [51, 89.526] + - - [512, 512, 25, 4096] + - [53, 65.943] + - - [128, 128, 2048, 64] + - [252, 34.519] + - - [30528, 2560, 1, 1024] + - [13, 89.377] + - - [128, 128, 1536, 64] + - [251, 38.611] + - - [1024, 12288, 1, 1024] + - [32, 87.243] + - - [1024, 12288, 1, 4096] + - [32, 86.855] + - - [30528, 1920, 1, 1024] + - [13, 89.007] + - - [4096, 12288, 1, 1024] + - [13, 87.207] + - - [128, 128, 81, 12544] + - [33, 47.626] + - - [128, 128, 121, 9216] + - [52, 40.957] + - - [128, 128, 169, 6400] + - [52, 50.915] + - - [256, 256, 36, 4096] + - [34, 57.024] + - - [256, 256, 49, 2304] + - [15, 74.637] + - - [256, 256, 64, 2304] + - [51, 72.048] + - - [256, 256, 81, 4096] + - [53, 53.847] + - - [256, 256, 121, 2304] + - [32, 74.29] + - - [256, 256, 169, 2304] + - [32, 75.179] + - - [512, 512, 81, 1024] + - [49, 76.911] + - - [512, 512, 121, 1024] + - [13, 77.498] + - - [512, 512, 169, 1024] + - [49, 78.806] + - - [512, 512, 36, 1024] + - [13, 74.191] + - - [512, 512, 49, 1024] + - [30, 72.03] + - - [512, 512, 64, 1024] + - [30, 73.875] + - - [128, 128, 192, 64] + - [42, 46.723] + - - [768, 2048, 1, 768] + - [49, 69.819] + - - [3072, 2048, 1, 768] + - [30, 83.846] + - - [768, 2048, 1, 3072] + - [49, 72.237] + - - [384, 384, 144, 64] + - [18, 80.042] + - - [768, 4608, 1, 768] + - [32, 85.727] + - - [3072, 4608, 1, 768] + - [15, 89.481] + - - [768, 4608, 1, 3072] + - [32, 89.098] + - - [512, 512, 48, 64] + - [38, 76.587] + - - [128, 128, 256, 64] + - [21, 56.437] + - - [384, 384, 192, 64] + - [263, 69.034] + - - [1024, 4608, 1, 1024] + - [15, 86.002] + - - [4096, 4608, 1, 1024] + - [15, 90.108] + - - [1024, 4608, 1, 4096] + - [15, 89.968] + - - [256, 256, 36, 432] + - [1, 81.184] + - - [256, 256, 36, 456] + - [1, 82.38] + - - [256, 256, 36, 504] + - [1, 82.84] + - - [256, 256, 49, 1120] + - [37, 79.217] + - - [256, 256, 36, 442] + - [1, 78.572] + - - [256, 256, 49, 950] + - [18, 78.725] + - - [256, 256, 64, 616] + - [1, 78.287] + - - [256, 256, 64, 660] + - [1, 77.845] + - - [256, 256, 36, 408] + - [1, 81.116] + - - [256, 256, 49, 1008] + - [37, 79.767] + - - [256, 256, 36, 462] + - [1, 80.462] + - - [256, 256, 36, 468] + - [1, 81.292] + - - [256, 256, 36, 494] + - [1, 81.576] + - - [512, 512, 64, 48] + - [38, 73.577] + - - [256, 256, 64, 140] + - [38, 68.601] + - - [512, 512, 64, 56] + - [27, 77.042] + - - [512, 512, 49, 90] + - [19, 76.884] + - - [512, 512, 49, 60] + - [19, 72.201] + - - [256, 256, 49, 864] + - [43, 78.572] + - - [256, 256, 64, 224] + - [17, 74.263] + - - [256, 256, 64, 176] + - [1, 73.392] + - - [256, 256, 64, 154] + - [1, 70.527] + - - [512, 512, 49, 80] + - [1, 80.074] + - - [256, 256, 49, 1200] + - [18, 79.294] + - - [256, 256, 64, 704] + - [37, 77.755] + - - [256, 256, 64, 768] + - [51, 77.232] + - - [256, 256, 49, 1160] + - [20, 78.278] + - - [256, 256, 49, 320] + - [18, 73.406] + - - [512, 512, 49, 70] + - [19, 73.546] + - - [256, 256, 49, 1240] + - [24, 78.59] + - - [256, 256, 36, 384] + - [32, 74.642] + - - [1024, 2048, 1, 888] + - [24, 76.884] + - - [1024, 2048, 1, 713] + - [24, 75.955] + - - [1024, 2048, 1, 660] + - [20, 75.603] + - - [1024, 2048, 1, 726] + - [24, 75.72] + - - [1024, 2048, 1, 672] + - [24, 76.239] + - - [1024, 2048, 1, 850] + - [24, 76.451] + - - [1024, 2048, 1, 805] + - [24, 76.014] + - - [1024, 2048, 1, 864] + - [24, 77.15] + - - [1024, 2048, 1, 768] + - [32, 75.774] + - - [1024, 2048, 1, 950] + - [24, 76.956] + - - [1024, 1024, 160, 96] + - [0, 65.988] + - - [2880, 16384, 1, 1920] + - [7, 89.066] + - - [1920, 16384, 1, 960] + - [37, 91.421] + - - [3840, 16384, 1, 1920] + - [24, 91.516] + - - [1920, 16384, 1, 3840] + - [24, 91.308] + - - [25216, 16384, 1, 1920] + - [24, 92.071] + - - [1024, 1024, 40, 96] + - [0, 80.169] + - - [2880, 4096, 1, 1920] + - [22, 84.532] + - - [1920, 4096, 1, 960] + - [37, 86.58] + - - [3840, 4096, 1, 1920] + - [5, 88.06] + - - [1920, 4096, 1, 3840] + - [51, 86.963] + - - [25216, 4096, 1, 1920] + - [24, 91.737] + - - [1024, 1024, 80, 96] + - [0, 71.696] + - - [2880, 8192, 1, 1920] + - [24, 87.821] + - - [1920, 8192, 1, 960] + - [37, 90.731] + - - [3840, 8192, 1, 1920] + - [24, 90.988] + - - [1920, 8192, 1, 3840] + - [30, 88.326] + - - [25216, 8192, 1, 1920] + - [43, 91.922] + - - [1024, 1024, 96, 96] + - [0, 68.908] + - - [1728, 16384, 1, 2304] + - [43, 87.703] + - - [2304, 16384, 1, 576] + - [1, 91.723] + - - [2304, 16384, 1, 2304] + - [51, 92.053] + - - [12672, 16384, 1, 2304] + - [7, 92.084] + - - [1024, 1024, 24, 96] + - [9, 83.512] + - - [1728, 4096, 1, 2304] + - [13, 81.094] + - - [2304, 4096, 1, 576] + - [37, 90.343] + - - [2304, 4096, 1, 2304] + - [32, 91.182] + - - [12672, 4096, 1, 2304] + - [7, 90.659] + - - [1024, 1024, 48, 96] + - [0, 78.739] + - - [1728, 8192, 1, 2304] + - [13, 85.258] + - - [2304, 8192, 1, 576] + - [37, 91.34] + - - [2304, 8192, 1, 2304] + - [24, 91.538] + - - [12672, 8192, 1, 2304] + - [24, 91.732] + - - [1024, 1024, 16, 96] + - [1, 81.739] + - - [1152, 4096, 1, 3072] + - [32, 89.919] + - - [3072, 4096, 1, 384] + - [24, 85.384] + - - [1536, 4096, 1, 3072] + - [49, 84.572] + - - [3072, 4096, 1, 1536] + - [15, 88.218] + - - [6400, 4096, 1, 3072] + - [15, 89.034] + - - [1024, 1024, 32, 96] + - [0, 80.408] + - - [1152, 8192, 1, 3072] + - [51, 91.186] + - - [3072, 8192, 1, 384] + - [5, 87.388] + - - [1536, 8192, 1, 3072] + - [51, 88.687] + - - [3072, 8192, 1, 1536] + - [30, 88.213] + - - [6400, 8192, 1, 3072] + - [32, 90.835] + - - [2048, 4096, 1, 2048] + - [13, 83.318] + - - [2048, 4096, 1, 4096] + - [13, 82.704] + - - [29000, 199, 1, 2048] + - [51, 59.311] + - - [29000, 221, 1, 2048] + - [51, 61.387] + - - [29000, 224, 1, 2048] + - [32, 64.265] + - - [29000, 229, 1, 2048] + - [51, 63.706] + - - [29000, 234, 1, 2048] + - [51, 67.428] + - - [29000, 242, 1, 2048] + - [32, 70.387] + - - [29000, 246, 1, 2048] + - [51, 69.792] + - - [29000, 247, 1, 2048] + - [51, 69.968] + - - [29000, 256, 1, 2048] + - [32, 71.633] + - - [29000, 262, 1, 2048] + - [22, 50.125] + - - [29000, 264, 1, 2048] + - [30, 50.558] + - - [29000, 265, 1, 2048] + - [48, 50.355] + - - [29000, 274, 1, 2048] + - [48, 51.411] + - - [29000, 277, 1, 2048] + - [30, 52.747] + - - [29000, 279, 1, 2048] + - [30, 52.774] + - - [29000, 288, 1, 2048] + - [30, 55.156] + - - [29000, 296, 1, 2048] + - [30, 54.781] + - - [29000, 315, 1, 2048] + - [40, 59.302] + - - [29000, 335, 1, 2048] + - [30, 62.745] + - - [4096, 4096, 1, 2048] + - [13, 87.649] + - - [29000, 2283, 1, 1024] + - [13, 88.376] + - - [29000, 2296, 1, 1024] + - [15, 89.517] + - - [29000, 2306, 1, 1024] + - [13, 84.802] + - - [29000, 2309, 1, 1024] + - [13, 84.861] + - - [29000, 2318, 1, 1024] + - [13, 85.226] + - - [29000, 2320, 1, 1024] + - [13, 85.321] + - - [29000, 2324, 1, 1024] + - [13, 85.439] + - - [29000, 2325, 1, 1024] + - [13, 85.457] + - - [29000, 2329, 1, 1024] + - [13, 85.596] + - - [29000, 2338, 1, 1024] + - [13, 85.944] + - - [29000, 2345, 1, 1024] + - [13, 86.205] + - - [29000, 2350, 1, 1024] + - [13, 86.395] + - - [29000, 2362, 1, 1024] + - [13, 86.815] + - - [29000, 2366, 1, 1024] + - [13, 86.932] + - - [29000, 2368, 1, 1024] + - [13, 87.054] + - - [29000, 2374, 1, 1024] + - [13, 87.261] + - - [29000, 2390, 1, 1024] + - [13, 87.848] + - - [512, 512, 320, 64] + - [0, 47.824] + - - [29000, 561, 1, 1024] + - [13, 76.194] + - - [29000, 574, 1, 1024] + - [13, 77.872] + - - [29000, 600, 1, 1024] + - [13, 81.202] + - - [29000, 608, 1, 1024] + - [13, 82.325] + - - [29000, 615, 1, 1024] + - [13, 83.237] + - - [29000, 622, 1, 1024] + - [13, 84.189] + - - [29000, 625, 1, 1024] + - [13, 84.446] + - - [29000, 626, 1, 1024] + - [13, 84.816] + - - [29000, 628, 1, 1024] + - [13, 84.992] + - - [29000, 636, 1, 1024] + - [13, 85.885] + - - [29000, 651, 1, 1024] + - [15, 74.881] + - - [29000, 658, 1, 1024] + - [13, 75.292] + - - [29000, 669, 1, 1024] + - [15, 76.943] + - - [29000, 670, 1, 1024] + - [13, 76.893] + - - [29000, 672, 1, 1024] + - [15, 77.087] + - - [29000, 684, 1, 1024] + - [15, 78.599] + - - [29000, 716, 1, 1024] + - [15, 82.213] + - - [29000, 730, 1, 1024] + - [15, 83.638] + - - [2560, 1024, 1, 2560] + - [49, 74.895] + - - [2560, 1024, 1, 4096] + - [49, 73.821] + - - [1024, 1024, 512, 64] + - [12, 48.284] + - - [1024, 32768, 1, 4096] + - [32, 89.684] + - - [3072, 32768, 1, 1024] + - [30, 89.084] + - - [4096, 32768, 1, 1024] + - [13, 88.407] + - - [50304, 32768, 1, 1024] + - [51, 89.761] + - - [1024, 1024, 24, 128] + - [5, 83.756] + - - [128, 1024, 24, 1024] + - [51, 76.686] + - - [4096, 256, 1, 12288] + - [34, 62.406] + - - [2048, 256, 1, 13312] + - [56, 72.733] + - - [4096, 256, 1, 15360] + - [53, 62.239] + - - [2048, 512, 1, 16640] + - [67, 76.591] + - - [4096, 256, 1, 14336] + - [53, 62.551] + - - [1024, 1024, 1, 8192] + - [51, 74.078] + - - [1024, 512, 1, 16384] + - [56, 68.014] + - - [4096, 256, 1, 9216] + - [67, 64.36] + - - [1024, 512, 1, 12288] + - [56, 70.852] + - - [4096, 200, 1, 12288] + - [59, 49.124] + - - [1024, 1024, 1, 13312] + - [56, 75.49] + - - [2048, 256, 1, 16384] + - [67, 62.221] + - - [2048, 512, 1, 16384] + - [60, 67.13] + - - [1024, 1024, 1, 8320] + - [24, 74.218] + - - [2048, 256, 1, 14336] + - [60, 72.242] + - - [4096, 200, 1, 16640] + - [58, 59.031] + - - [1024, 1024, 1, 16640] + - [56, 76.537] + - - [1024, 1024, 1, 14336] + - [56, 75.725] + - - [2048, 512, 1, 9216] + - [32, 74.168] + - - [1024, 1024, 1, 15360] + - [56, 76.041] + - - [2048, 512, 1, 8192] + - [32, 74.078] + - - [2048, 512, 1, 13312] + - [60, 75.517] + - - [1024, 1024, 1, 11264] + - [56, 74.619] + - - [1024, 512, 1, 16640] + - [55, 73.428] + - - [2048, 512, 1, 10240] + - [51, 74.732] + - - [2048, 256, 1, 16640] + - [56, 73.297] + - - [4096, 256, 1, 13312] + - [66, 62.311] + - - [4096, 200, 1, 15360] + - [59, 50.067] + - - [2048, 512, 1, 12288] + - [56, 75.075] + - - [4096, 256, 1, 8192] + - [51, 73.879] + - - [2048, 512, 1, 15360] + - [67, 73.464] + - - [2048, 512, 1, 11264] + - [56, 74.737] + - - [2048, 256, 1, 12288] + - [60, 72.269] + - - [1024, 1024, 1, 12288] + - [56, 75.138] + - - [4096, 256, 1, 16384] + - [53, 60.94] + - - [2048, 256, 1, 15360] + - [67, 72.571] + - - [2048, 512, 1, 8320] + - [24, 74.182] + - - [1024, 1024, 1, 10240] + - [51, 74.728] + - - [1024, 1024, 1, 9216] + - [32, 74.818] + - - [4096, 200, 1, 16384] + - [34, 48.632] + - - [2048, 512, 1, 14336] + - [60, 75.481] + - - [1024, 512, 1, 13312] + - [67, 72.124] + - - [4096, 256, 1, 8320] + - [15, 74.01] + - - [4096, 200, 1, 13312] + - [59, 51.163] + - - [1024, 512, 1, 14336] + - [67, 72.458] + - - [4096, 256, 1, 11264] + - [53, 62.081] + - - [4096, 256, 1, 10240] + - [34, 62.74] + - - [4096, 200, 1, 14336] + - [66, 48.934] + - - [4096, 256, 1, 16640] + - [58, 74.046] + - - [1024, 512, 1, 15360] + - [56, 73.704] + - - [1024, 1024, 1, 16384] + - [15, 74.344] + - - [224, 192, 36, 10368] + - [4, 60.556] + - - [320, 256, 9, 19584] + - [56, 71.953] + - - [256, 256, 11, 13056] + - [65, 65.361] + - - [320, 256, 9, 9792] + - [63, 69.453] + - - [320, 256, 11, 13056] + - [60, 64.536] + - - [256, 256, 9, 9792] + - [54, 77.936] + - - [256, 224, 9, 19584] + - [56, 66.909] + - - [256, 256, 9, 19584] + - [54, 75.341] + - - [128, 128, 36, 12000] + - [47, 56.866] + - - [128, 128, 49, 12800] + - [57, 49.597] + - - [128, 128, 25, 25088] + - [64, 46.746] + - - [128, 128, 49, 25600] + - [62, 43.628] + - - [128, 128, 25, 50176] + - [61, 43.534] + - - [128, 128, 36, 12544] + - [35, 56.365] + - - [128, 128, 49, 9216] + - [68, 40.493] + - - [1024, 1024, 1, 12544] + - [56, 75.188] + - - [1024, 1000, 1, 12544] + - [56, 73.69] + - - [1024, 512, 1, 1600] + - [121, 62.023] + - - [2048, 512, 1, 100] + - [73, 48.871] + - - [768, 640, 1, 768] + - [110, 51.393] + - - [768, 1280, 1, 768] + - [110, 60.137] + - - [1024, 512, 1, 1024] + - [110, 56.726] + - - [1024, 512, 1, 3072] + - [110, 61.396] + - - [30522, 120, 1, 1024] + - [87, 68.005] + - - [30522, 80, 1, 1024] + - [105, 46.877] + - - [64, 128, 512, 128] + - [86, 48.059] + - - [64, 512, 64, 512] + - [83, 49.435] + - - [64, 64, 768, 64] + - [76, 40.227] + - - [64, 64, 96, 64] + - [72, 24.575] + - - [1856, 448, 1, 3328] + - [108, 63.972] + - - [128, 6784, 1, 3328] + - [105, 55.882] + - - [2048, 400, 1, 512] + - [82, 51.005] + - - [2368, 448, 1, 128] + - [97, 49.011] + - - [256, 4288, 1, 3328] + - [136, 69.377] + - - [704, 1856, 1, 3328] + - [108, 61.45] + - - [448, 1024, 1, 1280] + - [136, 49.76] + - - [256, 1408, 1, 3328] + - [110, 42.758] + - - [704, 1856, 1, 1280] + - [85, 59.974] + - - [128, 5056, 1, 128] + - [103, 40.8] + - - [2368, 128, 1, 256] + - [129, 31.37] + - - [64, 5056, 1, 256] + - [133, 30.635] + - - [256, 2944, 1, 256] + - [85, 50.202] + - - [256, 1856, 1, 1280] + - [87, 51.871] + - - [128, 3584, 1, 1280] + - [110, 50.87] + - - [4288, 256, 1, 256] + - [105, 61.134] + - - [2944, 128, 1, 128] + - [102, 32.872] + - - [5888, 64, 1, 3328] + - [106, 41.012] + - - [2944, 256, 1, 3328] + - [108, 58.264] + - - [704, 1024, 1, 128] + - [129, 43.637] + - - [1408, 448, 1, 1280] + - [85, 47.955] + - - [1408, 704, 1, 3328] + - [136, 62.821] + - - [1408, 256, 1, 1280] + - [110, 40.199] + - - [3072, 128, 1, 1024] + - [87, 42.487] + - - [2944, 256, 1, 256] + - [85, 49.516] + - - [704, 1408, 1, 3328] + - [87, 62.884] + - - [2944, 256, 1, 128] + - [130, 45.244] + - - [2368, 128, 1, 3328] + - [110, 35.71] + - - [2944, 128, 1, 256] + - [102, 38.584] + - - [448, 1408, 1, 256] + - [103, 42.758] + - - [64, 5056, 1, 3328] + - [106, 35.782] + - - [1024, 448, 1, 128] + - [79, 35.322] + - - [256, 3584, 1, 3328] + - [82, 59.898] + - - [256, 1408, 1, 256] + - [102, 36.265] + - - [5056, 64, 1, 1280] + - [84, 34.555] + - - [1024, 704, 1, 256] + - [130, 47.689] + - - [128, 4288, 1, 128] + - [81, 41.49] + - - [6784, 64, 1, 128] + - [80, 32.313] + - - [3584, 256, 1, 128] + - [103, 47.648] + - - [5888, 64, 1, 256] + - [133, 35.273] + - - [1856, 256, 1, 1280] + - [87, 51.772] + - - [64, 5888, 1, 3328] + - [83, 41.517] + - - [704, 1024, 1, 1280] + - [85, 54.321] + - - [448, 1856, 1, 128] + - [83, 45.045] + - - [1024, 704, 1, 1280] + - [85, 54.691] + - - [128, 5888, 1, 256] + - [85, 49.516] + - - [704, 704, 1, 3328] + - [136, 57.57] + - - [704, 1408, 1, 1280] + - [105, 61.923] + - - [3584, 256, 1, 3328] + - [105, 60.832] + - - [704, 1856, 1, 128] + - [103, 53.59] + - - [2944, 448, 1, 128] + - [125, 51.393] + - - [128, 2944, 1, 1280] + - [110, 41.819] + - - [448, 2944, 1, 1280] + - [105, 59.275] + - - [3584, 128, 1, 256] + - [85, 38.12] + - - [448, 1408, 1, 3328] + - [85, 49.024] + - - [256, 3584, 1, 256] + - [136, 51.894] + - - [256, 2944, 1, 3328] + - [134, 58.368] + - - [448, 2368, 1, 128] + - [83, 48.857] + - - [1408, 704, 1, 256] + - [131, 55.201] + - - [448, 2944, 1, 3328] + - [87, 57.786] + - - [64, 5888, 1, 256] + - [133, 35.503] + - - [6784, 128, 1, 3328] + - [110, 55.269] + - - [704, 704, 1, 256] + - [82, 40.484] + - - [128, 4288, 1, 3328] + - [87, 63.304] + - - [448, 704, 1, 1280] + - [136, 34.316] + - - [128, 5056, 1, 1280] + - [85, 49.254] + - - [1024, 448, 1, 3328] + - [136, 53.338] + - - [1856, 704, 1, 1280] + - [85, 59.744] + - - [448, 1024, 1, 128] + - [129, 33.364] + - - [448, 2368, 1, 3328] + - [132, 58.594] + - - [5056, 64, 1, 128] + - [80, 29.2] + - - [1024, 700, 1, 512] + - [108, 49.782] + - - [704, 1024, 1, 256] + - [85, 46.674] + - - [128, 6784, 1, 1280] + - [82, 56.572] + - - [1856, 256, 1, 256] + - [101, 40.529] + - - [256, 4288, 1, 1280] + - [87, 67.685] + - - [256, 1856, 1, 128] + - [130, 36.978] + - - [7680, 64, 1, 2560] + - [134, 52.71] + - - [448, 1408, 1, 128] + - [102, 38.345] + - - [6784, 128, 1, 256] + - [131, 51.118] + - - [704, 448, 1, 256] + - [102, 30.107] + - - [704, 1408, 1, 128] + - [82, 48.925] + - - [4288, 128, 1, 1280] + - [87, 59.537] + - - [128, 2944, 1, 128] + - [80, 34.109] + - - [1024, 704, 1, 3328] + - [85, 55.95] + - - [128, 4288, 1, 256] + - [85, 45.74] + - - [704, 448, 1, 3328] + - [87, 37.041] + - - [448, 2368, 1, 1280] + - [132, 57.777] + - - [64, 6784, 1, 3328] + - [132, 44.513] + - - [2944, 256, 1, 1280] + - [134, 56.672] + - - [256, 2368, 1, 128] + - [81, 38.557] + - - [1856, 704, 1, 256] + - [131, 52.805] + - - [1408, 448, 1, 3328] + - [108, 49.065] + - - [1856, 448, 1, 1280] + - [134, 61.296] + - - [128, 5888, 1, 128] + - [103, 47.102] + - - [704, 1856, 1, 256] + - [134, 53.784] + - - [256, 2368, 1, 1280] + - [103, 46.981] + - - [2944, 448, 1, 256] + - [85, 56.297] + - - [1856, 448, 1, 128] + - [103, 45.993] + - - [2368, 128, 1, 1280] + - [102, 34.014] + - - [64, 6784, 1, 256] + - [83, 35.164] + - - [64, 5056, 1, 1280] + - [107, 34.542] + - - [2368, 256, 1, 1280] + - [85, 45.383] + - - [2368, 448, 1, 1280] + - [134, 60.985] + - - [128, 3584, 1, 256] + - [85, 38.395] + - - [704, 448, 1, 1280] + - [129, 35.449] + - - [128, 3584, 1, 3328] + - [87, 53.915] + - - [4288, 256, 1, 1280] + - [131, 69.282] + - - [4288, 128, 1, 3328] + - [136, 63.512] + - - [7680, 128, 1, 2560] + - [87, 62.126] + - - [1408, 256, 1, 128] + - [129, 31.253] + - - [256, 1408, 1, 1280] + - [110, 40.389] + - - [128, 2368, 1, 256] + - [102, 31.708] + - - [6784, 64, 1, 3328] + - [108, 47.247] + - - [128, 2944, 1, 3328] + - [110, 44.567] + - - [2944, 448, 1, 3328] + - [85, 62.311] + - - [256, 4288, 1, 256] + - [82, 61.31] + - - [5888, 128, 1, 256] + - [85, 49.07] + - - [5056, 64, 1, 256] + - [133, 30.635] + - - [1024, 704, 1, 128] + - [81, 44.197] + - - [128, 5056, 1, 3328] + - [85, 50.342] + - - [4288, 128, 1, 256] + - [104, 44.084] + - - [1408, 448, 1, 128] + - [101, 38.106] + - - [704, 448, 1, 128] + - [102, 28.194] + - - [3584, 256, 1, 256] + - [131, 53.094] + - - [128, 2944, 1, 256] + - [102, 36.635] + - - [128, 6784, 1, 128] + - [81, 46.177] + - - [448, 1856, 1, 256] + - [132, 50.707] + - - [3584, 128, 1, 3328] + - [136, 53.856] + - - [5888, 128, 1, 3328] + - [85, 58.246] + - - [1408, 704, 1, 1280] + - [131, 63.746] + - - [6784, 64, 1, 256] + - [85, 36.18] + - - [448, 2944, 1, 256] + - [82, 54.407] + - - [448, 2368, 1, 256] + - [132, 53.229] + - - [64, 6784, 1, 1280] + - [106, 43.258] + - - [128, 2368, 1, 3328] + - [87, 35.701] + - - [5056, 64, 1, 3328] + - [83, 35.773] + - - [64, 5888, 1, 128] + - [80, 33.689] + - - [5056, 128, 1, 3328] + - [85, 50.03] + - - [448, 704, 1, 256] + - [129, 31.19] + - - [2944, 128, 1, 3328] + - [87, 44.571] + - - [704, 704, 1, 128] + - [117, 36.049] + - - [2368, 128, 1, 128] + - [80, 27.264] + - - [5056, 128, 1, 128] + - [129, 39.423] + - - [448, 1024, 1, 3328] + - [110, 53.437] + - - [2368, 256, 1, 256] + - [129, 40.38] + - - [256, 2368, 1, 3328] + - [85, 47.233] + - - [256, 3584, 1, 128] + - [105, 48.433] + - - [4288, 256, 1, 128] + - [82, 53.094] + - - [2368, 256, 1, 128] + - [101, 37.971] + - - [256, 1856, 1, 256] + - [85, 40.353] + - - [256, 2944, 1, 128] + - [103, 46.011] + - - [1408, 256, 1, 3328] + - [87, 42.676] + - - [2368, 448, 1, 256] + - [106, 52.273] + - - [4288, 256, 1, 3328] + - [87, 69.611] + - - [1856, 704, 1, 128] + - [102, 48.307] + - - [4288, 128, 1, 128] + - [129, 39.721] + - - [1408, 448, 1, 256] + - [85, 41.585] + - - [6784, 64, 1, 1280] + - [85, 45.523] + - - [3584, 128, 1, 128] + - [80, 33.707] + - - [256, 2368, 1, 256] + - [103, 43.543] + - - [2944, 448, 1, 1280] + - [108, 61.247] + - - [448, 1408, 1, 1280] + - [85, 47.716] + - - [448, 1856, 1, 1280] + - [132, 57.069] + - - [1856, 256, 1, 128] + - [101, 35.358] + - - [2560, 128, 1, 2560] + - [136, 38.336] + - - [448, 1024, 1, 256] + - [103, 39.96] + - - [1024, 448, 1, 1280] + - [110, 49.575] + - - [128, 5056, 1, 256] + - [85, 42.911] + - - [448, 2944, 1, 128] + - [73, 50.175] + - - [128, 3584, 1, 128] + - [103, 35.417] + - - [1408, 256, 1, 256] + - [102, 36.139] + - - [128, 5888, 1, 3328] + - [134, 58.206] + - - [2368, 448, 1, 3328] + - [108, 62.79] + - - [128, 5888, 1, 1280] + - [85, 57.132] + - - [64, 5056, 1, 128] + - [80, 29.38] + - - [64, 6784, 1, 128] + - [129, 32.99] + - - [448, 704, 1, 128] + - [129, 28.37] + - - [1408, 704, 1, 128] + - [131, 49.011] + - - [2368, 256, 1, 3328] + - [108, 46.958] + - - [5888, 128, 1, 1280] + - [108, 56.803] + - - [256, 3584, 1, 1280] + - [87, 57.43] + - - [256, 1408, 1, 128] + - [129, 30.883] + - - [256, 4288, 1, 128] + - [81, 54.84] + - - [5888, 128, 1, 128] + - [103, 43.177] + - - [1856, 256, 1, 3328] + - [87, 55.467] + - - [64, 5888, 1, 1280] + - [133, 40.136] + - - [704, 704, 1, 1280] + - [87, 53.721] + - - [128, 2368, 1, 1280] + - [129, 35.164] + - - [3584, 256, 1, 1280] + - [131, 60.254] + - - [5888, 64, 1, 1280] + - [133, 40.033] + - - [3584, 128, 1, 1280] + - [87, 50.694] + - - [5056, 128, 1, 1280] + - [85, 48.474] + - - [448, 1856, 1, 3328] + - [106, 57.967] + - - [1024, 448, 1, 256] + - [103, 41.72] + - - [2944, 128, 1, 1280] + - [129, 42.225] + - - [128, 2368, 1, 128] + - [80, 27.783] + - - [256, 2944, 1, 1280] + - [85, 57.028] + - - [704, 1024, 1, 3328] + - [108, 55.738] + - - [128, 6784, 1, 256] + - [105, 51.33] + - - [256, 1856, 1, 3328] + - [136, 55.517] + - - [6784, 128, 1, 128] + - [131, 45.424] + - - [704, 1408, 1, 256] + - [105, 56.397] + - - [4096, 128, 1, 4096] + - [110, 61.689] + - - [5888, 64, 1, 128] + - [129, 31.0] + - - [5056, 128, 1, 256] + - [102, 42.469] + - - [6784, 128, 1, 1280] + - [105, 56.018] + - - [1856, 448, 1, 256] + - [106, 50.761] + - - [128, 4288, 1, 1280] + - [136, 59.559] + - - [448, 704, 1, 3328] + - [126, 36.996] + - - [1856, 704, 1, 3328] + - [85, 61.689] + - - [1024, 1024, 1, 3328] + - [87, 66.625] + - - [2048, 200, 1, 3200] + - [78, 47.337] + - - [2048, 256, 1, 3328] + - [110, 61.368] + - - [4096, 200, 1, 11264] + - [87, 47.838] + - - [2048, 512, 1, 1024] + - [105, 64.351] + - - [1024, 1024, 1, 64] + - [71, 44.657] + - - [512, 1024, 1, 1536] + - [110, 59.0] + - - [1024, 512, 1, 512] + - [85, 50.044] + - - [2048, 512, 1, 640] + - [105, 67.825] + - - [1024, 1024, 1, 512] + - [82, 63.11] + - - [2048, 256, 1, 2048] + - [110, 59.853] + - - [1024, 512, 1, 128] + - [81, 39.735] + - - [2048, 512, 1, 256] + - [82, 59.78] + - - [4096, 200, 1, 2560] + - [105, 54.371] + - - [1024, 1024, 1, 1152] + - [105, 68.691] + - - [2048, 200, 1, 32] + - [128, 17.393] + - - [512, 1024, 1, 2816] + - [110, 61.098] + - - [2048, 200, 1, 2080] + - [74, 48.894] + - - [2048, 200, 1, 1024] + - [87, 42.036] + - - [4096, 200, 1, 4096] + - [105, 52.002] + - - [1024, 512, 1, 11264] + - [110, 63.209] + - - [1024, 1024, 1, 1792] + - [105, 65.975] + - - [4096, 200, 1, 768] + - [131, 50.996] + - - [4096, 256, 1, 1024] + - [82, 66.038] + - - [1024, 512, 1, 256] + - [103, 45.46] + - - [1024, 512, 1, 1408] + - [126, 58.553] + - - [1024, 512, 1, 5632] + - [136, 62.469] + - - [4096, 200, 1, 256] + - [108, 43.723] + - - [512, 1024, 1, 3072] + - [110, 61.31] + - - [1024, 1024, 1, 4160] + - [96, 72.535] + - - [2048, 256, 1, 384] + - [130, 52.566] + - - [4096, 200, 1, 640] + - [82, 50.116] + - - [1024, 1024, 1, 7168] + - [87, 67.022] + - - [4096, 256, 1, 768] + - [105, 67.211] + - - [2048, 256, 1, 6656] + - [110, 62.578] + - - [2048, 200, 1, 3072] + - [87, 47.098] + - - [1024, 512, 1, 2816] + - [110, 61.12] + - - [4096, 256, 1, 7680] + - [136, 66.503] + - - [4096, 200, 1, 1024] + - [105, 51.646] + - - [2048, 200, 1, 1792] + - [87, 45.415] + - - [1024, 1024, 1, 2816] + - [110, 66.458] + - - [2048, 512, 1, 1536] + - [136, 65.704] + - - [4096, 256, 1, 3072] + - [110, 66.458] + - - [2048, 256, 1, 5632] + - [136, 62.37] + - - [1024, 512, 1, 6656] + - [136, 62.681] + - - [4096, 200, 1, 2080] + - [74, 55.449] + - - [2048, 200, 1, 13312] + - [87, 49.327] + - - [4096, 256, 1, 3584] + - [131, 67.008] + - - [2048, 256, 1, 8192] + - [136, 63.015] + - - [2048, 512, 1, 512] + - [82, 62.717] + - - [2048, 512, 1, 1152] + - [131, 68.745] + - - [2048, 200, 1, 9216] + - [87, 49.051] + - - [2048, 200, 1, 2560] + - [110, 46.39] + - - [2048, 256, 1, 4608] + - [110, 62.081] + - - [2048, 256, 1, 3584] + - [87, 61.621] + - - [1024, 512, 1, 640] + - [81, 54.655] + - - [2048, 512, 1, 768] + - [131, 66.228] + - - [2048, 200, 1, 1408] + - [131, 44.77] + - - [4096, 200, 1, 2048] + - [131, 53.36] + - - [1024, 1024, 1, 5632] + - [110, 66.968] + - - [2048, 512, 1, 3584] + - [87, 66.629] + - - [1024, 512, 1, 64] + - [120, 29.917] + - - [4096, 200, 1, 7680] + - [110, 52.133] + - - [1024, 1024, 1, 1280] + - [110, 65.154] + - - [2048, 200, 1, 896] + - [130, 44.896] + - - [2048, 256, 1, 32] + - [114, 21.503] + - - [2048, 256, 1, 1280] + - [136, 57.583] + - - [4096, 256, 1, 4096] + - [87, 66.534] + - - [2048, 256, 1, 11264] + - [136, 63.187] + - - [4096, 200, 1, 9216] + - [87, 51.33] + - - [1024, 512, 1, 4096] + - [110, 60.75] + - - [4096, 200, 1, 3840] + - [131, 52.313] + - - [1024, 1024, 1, 1920] + - [74, 69.359] + - - [2048, 200, 1, 7168] + - [87, 48.727] + - - [4096, 256, 1, 1152] + - [131, 69.002] + - - [2048, 256, 1, 1920] + - [110, 59.631] + - - [2048, 512, 1, 4160] + - [96, 72.661] + - - [2048, 512, 1, 5632] + - [87, 66.913] + - - [4096, 256, 1, 7168] + - [136, 65.533] + - - [4096, 200, 1, 128] + - [130, 40.091] + - - [2048, 200, 1, 5120] + - [110, 48.122] + - - [1024, 1024, 1, 6656] + - [87, 66.931] + - - [512, 1024, 1, 3200] + - [100, 61.522] + - - [2048, 256, 1, 1536] + - [110, 58.679] + - - [4096, 256, 1, 256] + - [131, 59.898] + - - [2048, 512, 1, 1408] + - [105, 69.255] + - - [1024, 512, 1, 2080] + - [96, 64.874] + - - [2048, 512, 1, 2304] + - [105, 67.035] + - - [4096, 200, 1, 512] + - [82, 48.474] + - - [2048, 200, 1, 1280] + - [87, 43.917] + - - [1024, 1024, 1, 2304] + - [105, 66.886] + - - [2048, 512, 1, 4608] + - [110, 66.796] + - - [4096, 256, 1, 6144] + - [87, 66.634] + - - [4096, 256, 1, 896] + - [105, 68.059] + - - [2048, 256, 1, 640] + - [130, 55.738] + - - [2048, 512, 1, 384] + - [105, 65.158] + - - [2048, 200, 1, 16384] + - [90, 41.923] + - - [4096, 200, 1, 10240] + - [110, 49.394] + - - [1024, 512, 1, 9216] + - [136, 62.496] + - - [4096, 200, 1, 1920] + - [131, 54.096] + - - [2048, 512, 1, 7680] + - [136, 67.031] + - - [1024, 512, 1, 3584] + - [110, 61.72] + - - [1024, 1024, 1, 32] + - [73, 30.156] + - - [2048, 512, 1, 1664] + - [105, 70.026] + - - [2048, 200, 1, 2048] + - [87, 45.708] + - - [1024, 1024, 1, 3584] + - [110, 66.679] + - - [4096, 256, 1, 6656] + - [110, 66.936] + - - [4096, 256, 1, 4160] + - [96, 72.553] + - - [2048, 256, 1, 3072] + - [110, 61.238] + - - [2048, 256, 1, 8320] + - [100, 62.988] + - - [1024, 512, 1, 3200] + - [126, 61.522] + - - [1024, 512, 1, 896] + - [105, 56.852] + - - [2048, 512, 1, 1280] + - [105, 67.419] + - - [4096, 200, 1, 64] + - [95, 32.313] + - - [1024, 1024, 1, 5120] + - [87, 66.859] + - - [2048, 512, 1, 6656] + - [87, 66.981] + - - [1024, 1024, 1, 128] + - [105, 53.685] + - - [512, 1024, 1, 1792] + - [126, 59.668] + - - [4096, 256, 1, 2816] + - [105, 69.53] + - - [1024, 1024, 1, 4096] + - [110, 66.616] + - - [2048, 200, 1, 4160] + - [74, 50.973] + - - [1024, 512, 1, 768] + - [136, 54.75] + - - [4096, 200, 1, 8320] + - [131, 53.288] + - - [2048, 512, 1, 896] + - [105, 67.432] + - - [4096, 200, 1, 7168] + - [82, 52.192] + - - [2048, 200, 1, 3840] + - [126, 47.102] + - - [1024, 1024, 1, 768] + - [105, 65.822] + - - [4096, 256, 1, 2304] + - [131, 68.876] + - - [2048, 200, 1, 16640] + - [85, 45.099] + - - [2048, 256, 1, 2816] + - [110, 59.189] + - - [1024, 512, 1, 384] + - [81, 53.432] + - - [2048, 200, 1, 7680] + - [87, 48.587] + - - [1024, 512, 1, 4608] + - [136, 62.221] + - - [4096, 200, 1, 32] + - [114, 24.138] + - - [4096, 200, 1, 3328] + - [131, 53.401] + - - [1024, 1024, 1, 1408] + - [105, 68.885] + - - [2048, 200, 1, 15360] + - [134, 43.854] + - - [512, 1024, 1, 2048] + - [110, 57.822] + - - [4096, 256, 1, 5632] + - [110, 66.697] + - - [2048, 256, 1, 1408] + - [105, 58.106] + - - [2048, 256, 1, 6144] + - [110, 62.596] + - - [4096, 256, 1, 3328] + - [131, 69.431] + - - [2048, 512, 1, 6144] + - [110, 66.999] + - - [2048, 512, 1, 3200] + - [105, 70.658] + - - [2048, 200, 1, 4608] + - [87, 47.648] + - - [1024, 1024, 1, 6144] + - [110, 66.954] + - - [4096, 256, 1, 1664] + - [131, 69.765] + - - [2048, 200, 1, 384] + - [103, 40.091] + - - [4096, 256, 1, 1792] + - [87, 65.831] + - - [2048, 512, 1, 2816] + - [105, 66.846] + - - [4096, 256, 1, 384] + - [131, 64.148] + - - [2048, 256, 1, 128] + - [81, 39.117] + - - [1024, 1024, 1, 640] + - [105, 66.868] + - - [4096, 200, 1, 5632] + - [82, 54.1] + - - [2048, 200, 1, 1152] + - [103, 44.152] + - - [4096, 256, 1, 512] + - [131, 64.558] + - - [1024, 1024, 1, 384] + - [105, 64.833] + - - [2048, 200, 1, 512] + - [130, 39.261] + - - [2048, 256, 1, 9216] + - [136, 63.06] + - - [2048, 256, 1, 1792] + - [110, 59.302] + - - [4096, 200, 1, 1792] + - [105, 54.023] + - - [2048, 200, 1, 1536] + - [87, 44.449] + - - [1024, 1024, 1, 3072] + - [87, 66.494] + - - [1024, 1024, 1, 2080] + - [96, 72.079] + - - [2048, 200, 1, 2304] + - [87, 46.232] + - - [2048, 256, 1, 7168] + - [136, 62.781] + - - [2048, 512, 1, 1792] + - [136, 65.903] + - - [1024, 1024, 1, 4608] + - [110, 66.873] + - - [512, 1024, 1, 1280] + - [136, 58.093] + - - [2048, 256, 1, 3200] + - [110, 61.319] + - - [1024, 512, 1, 3328] + - [110, 61.567] + - - [1024, 512, 1, 4160] + - [121, 66.219] + - - [4096, 200, 1, 6656] + - [82, 52.913] + - - [2048, 200, 1, 3328] + - [87, 47.134] + - - [1024, 1024, 1, 256] + - [82, 59.428] + - - [2048, 256, 1, 64] + - [80, 29.917] + - - [2048, 256, 1, 2304] + - [136, 60.25] + - - [4096, 200, 1, 8192] + - [136, 48.645] + - - [1024, 512, 1, 7168] + - [110, 62.131] + - - [1024, 512, 1, 1792] + - [110, 59.55] + - - [4096, 200, 1, 2816] + - [131, 54.664] + - - [1024, 1024, 1, 896] + - [131, 68.041] + - - [4096, 256, 1, 5120] + - [87, 66.778] + - - [4096, 256, 1, 2048] + - [110, 65.984] + - - [2048, 256, 1, 5120] + - [110, 62.248] + - - [2048, 256, 1, 7680] + - [110, 62.875] + - - [2048, 200, 1, 3584] + - [87, 47.382] + - - [1024, 512, 1, 1536] + - [110, 58.887] + - - [2048, 200, 1, 64] + - [117, 23.655] + - - [2048, 200, 1, 4096] + - [87, 47.761] + - - [1024, 1024, 1, 1536] + - [110, 65.691] + - - [4096, 256, 1, 32] + - [73, 29.683] + - - [4096, 256, 1, 1280] + - [105, 67.658] + - - [2048, 256, 1, 1024] + - [110, 56.252] + - - [1024, 512, 1, 1152] + - [100, 57.416] + - - [2048, 512, 1, 3328] + - [87, 66.629] + - - [4096, 200, 1, 3584] + - [131, 54.389] + - - [2048, 200, 1, 256] + - [103, 35.358] + - - [4096, 256, 1, 1920] + - [96, 69.72] + - - [2048, 256, 1, 1664] + - [78, 58.959] + - - [4096, 200, 1, 5120] + - [87, 52.034] + - - [1024, 512, 1, 8192] + - [136, 63.087] + - - [4096, 200, 1, 896] + - [105, 51.876] + - - [2048, 200, 1, 640] + - [103, 43.258] + - - [4096, 200, 1, 1408] + - [131, 53.581] + - - [2048, 200, 1, 5632] + - [87, 48.348] + - - [1024, 512, 1, 2560] + - [136, 60.845] + - - [4096, 200, 1, 1280] + - [105, 52.801] + - - [1024, 1024, 1, 2560] + - [87, 66.291] + - - [2048, 512, 1, 64] + - [71, 44.395] + - - [2048, 200, 1, 8192] + - [87, 48.961] + - - [2048, 512, 1, 3072] + - [136, 66.557] + - - [4096, 256, 1, 640] + - [105, 67.044] + - - [2048, 256, 1, 4096] + - [136, 61.892] + - - [4096, 200, 1, 1664] + - [105, 54.109] + - - [2048, 200, 1, 6656] + - [87, 48.65] + - - [512, 1024, 1, 768] + - [110, 54.849] + - - [2048, 200, 1, 8320] + - [78, 48.993] + - - [4096, 256, 1, 3840] + - [110, 66.701] + - - [1024, 1024, 1, 3200] + - [105, 70.387] + - - [4096, 256, 1, 4608] + - [87, 66.742] + - - [1024, 512, 1, 32] + - [115, 21.503] + - - [1024, 512, 1, 3840] + - [136, 61.86] + - - [2048, 512, 1, 1920] + - [121, 69.828] + - - [4096, 200, 1, 6144] + - [82, 52.304] + - - [2048, 200, 1, 2816] + - [87, 46.723] + - - [1024, 1024, 1, 3840] + - [110, 66.724] + - - [2048, 256, 1, 3840] + - [110, 61.756] + - - [1024, 512, 1, 7680] + - [110, 62.916] + - - [2048, 200, 1, 10240] + - [87, 49.115] + - - [2048, 512, 1, 5120] + - [110, 66.936] + - - [512, 1024, 1, 512] + - [110, 50.802] + - - [2048, 512, 1, 32] + - [96, 29.57] + - - [4096, 256, 1, 2560] + - [136, 66.313] + - - [4096, 256, 1, 64] + - [73, 44.265] + - - [2048, 200, 1, 768] + - [82, 41.404] + - - [2048, 512, 1, 2560] + - [87, 66.363] + - - [2048, 512, 1, 7168] + - [110, 67.053] + - - [2048, 512, 1, 128] + - [103, 52.566] + - - [4096, 200, 1, 2304] + - [105, 54.321] + - - [2048, 512, 1, 4096] + - [87, 66.67] + - - [2048, 256, 1, 2560] + - [110, 60.606] + - - [2048, 256, 1, 4160] + - [74, 66.061] + - - [1024, 512, 1, 1664] + - [136, 59.424] + - - [2048, 512, 1, 2080] + - [96, 72.102] + - - [2048, 512, 1, 3840] + - [136, 66.733] + - - [4096, 200, 1, 3072] + - [87, 51.61] + - - [1024, 1024, 1, 1664] + - [131, 69.58] + - - [512, 1024, 1, 2304] + - [136, 60.665] + - - [4096, 256, 1, 1408] + - [105, 69.503] + - - [2048, 256, 1, 1152] + - [131, 57.588] + - - [1024, 512, 1, 1280] + - [110, 58.093] + - - [2048, 200, 1, 12288] + - [136, 49.318] + - - [2048, 200, 1, 1664] + - [105, 45.383] + - - [4096, 200, 1, 4608] + - [105, 52.823] + - - [512, 1024, 1, 2560] + - [110, 60.922] + - - [4096, 200, 1, 384] + - [130, 49.178] + - - [2048, 200, 1, 128] + - [129, 29.421] + - - [2048, 200, 1, 11264] + - [87, 49.241] + - - [1024, 512, 1, 1920] + - [100, 59.884] + - - [4096, 256, 1, 1536] + - [131, 67.987] + - - [2048, 256, 1, 256] + - [130, 45.257] + - - [2048, 256, 1, 10240] + - [110, 63.196] + - - [1024, 512, 1, 5120] + - [110, 62.37] + - - [1024, 512, 1, 8320] + - [126, 63.096] + - - [1024, 512, 1, 10240] + - [110, 63.196] + - - [1024, 1024, 1, 2048] + - [110, 66.146] + - - [2048, 256, 1, 2080] + - [74, 64.536] + - - [4096, 256, 1, 128] + - [130, 51.934] + - - [2048, 256, 1, 896] + - [103, 57.159] + - - [4096, 200, 1, 1152] + - [105, 52.877] + - - [2048, 200, 1, 6144] + - [87, 48.506] + - - [1024, 1024, 1, 7680] + - [110, 66.977] + - - [2048, 200, 1, 1920] + - [78, 45.108] + - - [4096, 256, 1, 2080] + - [121, 71.786] + - - [2048, 200, 1, 14336] + - [136, 49.051] + - - [1024, 512, 1, 6144] + - [136, 61.802] + - - [1024, 512, 1, 2304] + - [110, 60.286] + - - [4096, 200, 1, 4160] + - [74, 56.333] + - - [4096, 200, 1, 1536] + - [105, 53.414] + - - [2048, 320, 1, 64] + - [115, 32.742] + - - [2048, 384, 1, 64] + - [71, 37.226] + - - [1024, 384, 1, 289] + - [94, 40.308] + - - [2048, 448, 1, 64] + - [71, 39.541] + - - [102, 101, 624, 64] + - [100, 35.223] + - - [101, 101, 624, 64] + - [110, 34.84] + - - [85, 85, 752, 64] + - [116, 28.722] + - - [112, 111, 576, 64] + - [75, 41.106] + - - [65, 65, 992, 64] + - [116, 19.229] + - - [77, 77, 816, 64] + - [71, 26.1] + - - [111, 111, 576, 64] + - [75, 38.007] + - - [84, 85, 752, 64] + - [71, 28.916] + - - [84, 84, 752, 64] + - [71, 28.907] + - - [71, 71, 896, 64] + - [116, 22.635] + - - [122, 122, 528, 64] + - [124, 40.362] + - - [78, 78, 816, 64] + - [116, 26.515] + - - [112, 112, 576, 64] + - [75, 41.512] + - - [77, 78, 816, 64] + - [93, 25.911] + - - [111, 112, 576, 64] + - [122, 37.772] + - - [92, 93, 688, 64] + - [78, 31.772] + - - [102, 102, 624, 64] + - [110, 35.449] + - - [99, 99, 624, 64] + - [78, 34.695] + - - [100, 102, 624, 64] + - [100, 35.223] + - - [123, 122, 528, 64] + - [98, 40.308] + - - [99, 102, 624, 64] + - [136, 34.695] + - - [93, 93, 688, 64] + - [78, 31.596] + - - [123, 123, 528, 64] + - [98, 40.281] + - - [100, 100, 624, 64] + - [87, 34.203] + - - [101, 102, 624, 64] + - [78, 35.128] + - - [102, 100, 624, 64] + - [87, 35.142] + - - [92, 92, 688, 64] + - [136, 31.537] + - - [3072, 128, 1, 4096] + - [136, 46.678] + - - [1728, 320, 1, 64] + - [72, 33.545] + - - [1440, 320, 1, 196] + - [73, 39.487] + - - [2592, 384, 1, 289] + - [74, 59.695] + - - [192, 80, 36, 10368] + - [136, 34.515] + - - [1280, 384, 1, 64] + - [123, 25.076] + - - [1280, 448, 1, 64] + - [129, 32.34] + - - [3456, 256, 1, 169] + - [118, 49.751] + - - [2304, 256, 1, 196] + - [95, 49.39] + - - [224, 192, 36, 2592] + - [77, 60.98] + - - [192, 128, 36, 1568] + - [131, 58.418] + - - [1296, 288, 1, 196] + - [72, 35.34] + - - [192, 64, 36, 6272] + - [85, 40.781] + - - [1728, 224, 1, 1225] + - [73, 44.034] + - - [1152, 384, 1, 64] + - [80, 28.383] + - - [1792, 256, 1, 289] + - [73, 45.108] + - - [1728, 384, 1, 169] + - [93, 44.071] + - - [1568, 256, 1, 289] + - [72, 41.607] + - - [1152, 448, 1, 64] + - [72, 32.119] + - - [1536, 256, 1, 64] + - [70, 25.69] + - - [1440, 320, 1, 49] + - [92, 24.49] + - - [1344, 512, 1, 64] + - [117, 36.658] + - - [1152, 256, 1, 196] + - [72, 37.15] + - - [1728, 192, 1, 1225] + - [92, 39.69] + - - [2048, 512, 1, 49] + - [73, 36.68] + - - [512, 2048, 1, 49] + - [71, 37.267] + - - [1728, 192, 1, 64] + - [92, 24.314] + - - [1536, 384, 1, 64] + - [120, 33.134] + - - [2048, 192, 1, 64] + - [92, 26.903] + - - [128, 96, 36, 1568] + - [95, 51.389] + - - [128, 128, 36, 3136] + - [121, 72.783] + - - [1280, 320, 1, 64] + - [129, 26.637] + - - [1792, 320, 1, 289] + - [73, 54.899] + - - [2880, 320, 1, 64] + - [93, 40.813] + - - [1728, 384, 1, 49] + - [94, 29.94] + - - [512, 1024, 1, 196] + - [73, 46.922] + - - [224, 192, 36, 5184] + - [125, 61.684] + - - [192, 80, 36, 20736] + - [136, 32.705] + - - [224, 192, 64, 4608] + - [108, 50.793] + - - [224, 192, 64, 2304] + - [108, 57.299] + - - [192, 80, 49, 14400] + - [96, 34.393] + - - [224, 192, 49, 6272] + - [134, 56.419] + - - [224, 192, 49, 3136] + - [77, 56.757] + - - [192, 80, 36, 41472] + - [140, 31.293] + - - [192, 80, 49, 28800] + - [87, 33.148] + - - [192, 80, 64, 9216] + - [137, 26.29] + - - [256, 224, 9, 9792] + - [87, 60.732] + - - [256, 256, 9, 4896] + - [74, 73.352] + - - [320, 256, 9, 4896] + - [93, 55.476] + - - [224, 192, 9, 19584] + - [136, 46.141] + - - [192, 192, 11, 3264] + - [72, 49.286] + - - [192, 192, 11, 6528] + - [122, 45.244] + - - [192, 192, 9, 4896] + - [74, 41.539] + - - [224, 192, 11, 6528] + - [125, 51.898] + - - [192, 192, 9, 19584] + - [110, 39.599] + - - [256, 224, 11, 13056] + - [85, 46.638] + - - [224, 192, 11, 13056] + - [108, 47.549] + - - [256, 256, 11, 3264] + - [118, 61.021] + - - [320, 256, 11, 6528] + - [136, 57.173] + - - [192, 192, 9, 9792] + - [129, 40.962] + - - [224, 224, 9, 9792] + - [87, 53.473] + - - [224, 192, 11, 3264] + - [95, 55.932] + - - [224, 224, 11, 6528] + - [85, 43.087] + - - [224, 224, 9, 19584] + - [78, 52.756] + - - [192, 192, 11, 13056] + - [133, 43.304] + - - [224, 224, 9, 4896] + - [74, 55.792] + - - [320, 256, 11, 3264] + - [121, 61.95] + - - [256, 256, 11, 6528] + - [77, 55.417] + - - [224, 192, 9, 4896] + - [74, 47.874] + - - [224, 224, 11, 13056] + - [134, 41.936] + - - [224, 224, 11, 3264] + - [95, 46.687] + - - [256, 224, 11, 6528] + - [99, 48.988] + - - [256, 224, 11, 3264] + - [95, 53.338] + - - [224, 192, 9, 9792] + - [100, 45.898] + - - [256, 224, 9, 4896] + - [74, 63.056] + - - [64, 64, 496, 64] + - [76, 34.673] + - - [135, 135, 32, 64] + - [93, 22.879] + - - [64, 65, 496, 64] + - [72, 25.884] + - - [65, 65, 472, 64] + - [93, 18.074] + - - [65, 65, 496, 64] + - [71, 17.925] + - - [70, 70, 216, 64] + - [91, 17.957] + - - [70, 71, 216, 64] + - [93, 18.584] + - - [71, 71, 216, 64] + - [70, 18.345] + - - [71, 71, 448, 64] + - [116, 21.322] + - - [77, 77, 248, 64] + - [93, 22.297] + - - [77, 77, 408, 64] + - [116, 23.953] + - - [77, 78, 248, 64] + - [71, 22.035] + - - [77, 78, 408, 64] + - [116, 24.332] + - - [78, 78, 248, 64] + - [116, 22.365] + - - [78, 78, 408, 64] + - [93, 24.616] + - - [80, 80, 152, 64] + - [115, 21.476] + - - [80, 84, 152, 64] + - [116, 22.41] + - - [84, 84, 152, 64] + - [93, 22.121] + - - [85, 85, 376, 64] + - [116, 27.124] + - - [93, 93, 344, 64] + - [116, 29.006] + - - [102, 102, 312, 64] + - [100, 30.955] + - - [112, 112, 288, 64] + - [122, 36.834] + - - [122, 122, 264, 64] + - [98, 36.252] + - - [123, 122, 264, 64] + - [124, 36.947] + - - [123, 123, 264, 64] + - [124, 36.965] + - - [511, 2048, 1, 2048] + - [110, 65.212] + - - [1024, 512, 1, 1025] + - [73, 60.525] + - - [512, 1023, 1, 1024] + - [87, 55.196] + - - [1025, 1024, 1, 1024] + - [110, 64.414] + - - [2048, 513, 1, 2048] + - [108, 61.558] + - - [1024, 1024, 1, 1025] + - [96, 69.751] + - - [960, 1024, 1, 1023] + - [96, 65.208] + - - [1024, 1024, 1, 1024] + - [105, 64.802] + - - [960, 1025, 1, 1024] + - [82, 60.565] + - - [2049, 512, 1, 2048] + - [87, 65.285] + - - [513, 1024, 1, 1024] + - [106, 49.169] + - - [512, 2048, 1, 2048] + - [82, 66.728] + - - [1024, 511, 1, 1024] + - [110, 53.699] + - - [1024, 512, 1, 1023] + - [73, 60.588] + - - [960, 1024, 1, 1025] + - [74, 65.203] + - - [959, 1024, 1, 1024] + - [82, 60.538] + - - [2048, 512, 1, 2049] + - [96, 71.551] + - - [511, 1024, 1, 1024] + - [110, 54.917] + - - [512, 2049, 1, 2048] + - [87, 65.781] + - - [1024, 513, 1, 1024] + - [85, 52.169] + - - [2048, 512, 1, 2047] + - [121, 71.502] + - - [1025, 512, 1, 1024] + - [87, 55.634] + - - [1024, 1024, 1, 1023] + - [96, 70.058] + - - [513, 2048, 1, 2048] + - [106, 55.395] + - - [1024, 1025, 1, 1024] + - [82, 65.352] + - - [512, 2048, 1, 2049] + - [96, 71.434] + - - [1024, 1023, 1, 1024] + - [105, 63.967] + - - [960, 1023, 1, 1024] + - [105, 59.843] + - - [2048, 511, 1, 2048] + - [105, 64.906] + - - [1023, 512, 1, 1024] + - [136, 54.998] + - - [2047, 512, 1, 2048] + - [131, 65.853] + - - [512, 1024, 1, 1024] + - [136, 56.726] + - - [512, 1024, 1, 1025] + - [73, 60.768] + - - [512, 2047, 1, 2048] + - [87, 65.461] + - - [512, 1025, 1, 1024] + - [87, 55.612] + - - [512, 2048, 1, 2047] + - [121, 71.533] + - - [960, 1024, 1, 1024] + - [82, 62.248] + - - [961, 1024, 1, 1024] + - [105, 61.373] + - - [512, 1024, 1, 1023] + - [73, 60.588] + - - [1023, 1024, 1, 1024] + - [105, 64.721] + - - [479, 1024, 1, 1024] + - [136, 51.668] + - - [479, 2048, 1, 2048] + - [105, 61.477] + - - [480, 1023, 1, 1024] + - [110, 51.443] + - - [480, 1024, 1, 1023] + - [73, 56.045] + - - [480, 1024, 1, 1025] + - [73, 56.374] + - - [480, 1025, 1, 1024] + - [110, 51.871] + - - [480, 2047, 1, 2048] + - [82, 63.223] + - - [480, 2048, 1, 2047] + - [121, 66.823] + - - [480, 2048, 1, 2049] + - [96, 66.823] + - - [480, 2049, 1, 2048] + - [136, 61.63] + - - [480, 3071, 1, 3072] + - [87, 64.342] + - - [481, 1024, 1, 1024] + - [110, 51.88] + - - [481, 2048, 1, 2048] + - [136, 61.558] + - - [1023, 480, 1, 1024] + - [110, 50.82] + - - [1024, 479, 1, 1024] + - [110, 50.991] + - - [1024, 480, 1, 1023] + - [73, 55.986] + - - [1024, 480, 1, 1025] + - [73, 55.932] + - - [1024, 481, 1, 1024] + - [87, 50.816] + - - [1025, 480, 1, 1024] + - [87, 51.289] + - - [2047, 480, 1, 2048] + - [105, 62.293] + - - [2048, 479, 1, 2048] + - [131, 62.239] + - - [2048, 480, 1, 2047] + - [121, 66.124] + - - [2048, 480, 1, 2049] + - [96, 66.119] + - - [2048, 481, 1, 2048] + - [105, 62.704] + - - [2049, 480, 1, 2048] + - [136, 60.967] + - - [3071, 480, 1, 3072] + - [110, 63.94] + - - [480, 1024, 1, 1024] + - [110, 51.989] + - - [480, 2048, 1, 2048] + - [105, 63.133] + - - [1024, 480, 1, 1024] + - [110, 50.87] + - - [2048, 480, 1, 2048] + - [136, 61.382] + - - [1024, 512, 1, 2048] + - [110, 60.164] + - - [1024, 960, 1, 1024] + - [82, 61.892] + - - [1024, 960, 1, 1600] + - [121, 66.534] + - - [1024, 1024, 1, 960] + - [96, 70.261] + - - [2048, 215, 1, 512] + - [103, 41.756] + - - [2048, 215, 1, 768] + - [134, 43.123] + - - [2048, 256, 1, 512] + - [131, 51.447] + - - [2048, 256, 1, 768] + - [105, 54.885] + - - [2048, 512, 1, 2048] + - [136, 66.119] + - - [2048, 512, 1, 67] + - [71, 42.491] + - - [2048, 512, 1, 74] + - [71, 45.0] + - - [256, 1280, 1, 1024] + - [102, 38.174] + - - [256, 1536, 1, 1024] + - [102, 45.144] + - - [256, 2304, 1, 1024] + - [110, 63.345] + - - [256, 2560, 1, 1024] + - [130, 49.489] + - - [256, 2816, 1, 1024] + - [85, 54.281] + - - [256, 3328, 1, 1024] + - [85, 63.566] + - - [256, 3584, 1, 1024] + - [82, 58.973] + - - [512, 1600, 1, 512] + - [85, 58.842] + - - [767, 1280, 1, 768] + - [82, 59.555] + - - [769, 1280, 1, 768] + - [131, 62.217] + - - [768, 1279, 1, 768] + - [82, 59.902] + - - [768, 1281, 1, 768] + - [105, 61.224] + - - [768, 1280, 1, 767] + - [96, 64.626] + - - [768, 1280, 1, 769] + - [96, 64.572] + - - [256, 4096, 1, 512] + - [82, 62.916] + - - [767, 768, 1, 768] + - [110, 58.616] + - - [769, 768, 1, 768] + - [130, 43.543] + - - [768, 767, 1, 768] + - [87, 57.561] + - - [768, 769, 1, 768] + - [134, 43.263] + - - [768, 768, 1, 767] + - [118, 65.461] + - - [768, 768, 1, 769] + - [118, 65.506] + - - [768, 768, 1, 768] + - [136, 61.188] + - - [128, 128, 49, 1152] + - [85, 60.317] + - - [128, 128, 49, 1216] + - [95, 65.817] + - - [128, 128, 36, 1800] + - [74, 71.533] + - - [128, 128, 36, 1900] + - [96, 70.933] + - - [128, 128, 64, 5880] + - [131, 54.957] + - - [128, 128, 49, 7680] + - [113, 40.136] + - - [128, 128, 64, 882] + - [74, 67.563] + - - [128, 128, 64, 931] + - [96, 68.908] + - - [128, 64, 121, 1152] + - [85, 57.552] + - - [128, 64, 81, 12000] + - [72, 34.348] + - - [128, 64, 121, 1216] + - [125, 56.415] + - - [128, 64, 81, 1800] + - [134, 50.342] + - - [128, 64, 81, 1900] + - [95, 55.386] + - - [128, 64, 49, 20280] + - [117, 37.465] + - - [128, 64, 49, 3042] + - [117, 48.19] + - - [128, 64, 49, 3211] + - [118, 48.623] + - - [128, 64, 169, 5880] + - [130, 36.694] + - - [128, 64, 121, 7680] + - [138, 32.196] + - - [128, 64, 169, 882] + - [98, 45.817] + - - [128, 64, 169, 931] + - [127, 46.25] + - - [256, 128, 25, 1080] + - [118, 66.706] + - - [256, 128, 25, 162] + - [73, 50.148] + - - [256, 128, 25, 171] + - [118, 50.4] + - - [1152, 256, 1, 1] + - [69, 0.749] + - - [1152, 256, 1, 1444] + - [72, 51.731] + - - [1152, 256, 1, 25] + - [69, 13.098] + - - [1152, 256, 1, 9] + - [69, 5.649] + - - [2304, 256, 1, 1444] + - [96, 69.841] + - - [2304, 340, 1, 1] + - [128, 1.263] + - - [2304, 340, 1, 1444] + - [121, 52.494] + - - [2304, 340, 1, 9] + - [114, 9.637] + - - [2304, 510, 1, 25] + - [114, 23.416] + - - [30522, 77, 1, 1024] + - [105, 45.383] + - - [1024, 780, 1, 1024] + - [134, 58.368] + - - [1024, 800, 1, 1024] + - [134, 60.092] + - - [1024, 820, 1, 1024] + - [134, 61.517] + - - [1024, 385, 1, 1024] + - [87, 41.95] + - - [1024, 462, 1, 1024] + - [87, 49.448] + - - [64, 512, 256, 512] + - [88, 41.061] + - - [64, 512, 128, 512] + - [135, 46.529] + - - [64, 512, 40, 512] + - [135, 45.902] + - - [96, 1024, 64, 1024] + - [85, 46.538] + - - [96, 1024, 128, 1024] + - [108, 47.504] + - - [64, 1024, 256, 1024] + - [111, 42.18] + - - [64, 1024, 32, 1024] + - [86, 45.185] + - - [64, 1024, 64, 1024] + - [88, 40.497] + - - [64, 1024, 128, 1024] + - [137, 42.121] + - - [64, 128, 1024, 128] + - [135, 51.014] + - - [1024, 864, 1, 1024] + - [82, 54.312] + - - [1024, 864, 1, 480] + - [96, 54.93] + - - [128, 3456, 1, 256] + - [105, 37.903] + - - [128, 4096, 1, 256] + - [85, 43.692] + - - [128, 6912, 1, 256] + - [82, 51.768] + - - [256, 3456, 1, 512] + - [82, 54.502] + - - [512, 864, 1, 1024] + - [136, 46.895] + - - [512, 864, 1, 13] + - [114, 9.754] + - - [64, 128, 1280, 128] + - [137, 30.31] + - - [64, 128, 1312, 128] + - [89, 29.362] + - - [64, 512, 192, 512] + - [137, 40.975] + - - [1024, 512, 1, 196] + - [73, 41.174] + - - [64, 128, 2048, 128] + - [112, 26.579] + - - [64, 128, 1536, 128] + - [139, 26.8] + - - [128, 128, 64, 6400] + - [113, 52.115] + - - [64, 128, 192, 128] + - [109, 41.327] + - - [64, 384, 144, 384] + - [132, 57.723] + - - [64, 512, 48, 512] + - [86, 46.353] + - - [64, 128, 256, 128] + - [107, 44.923] + - - [64, 384, 192, 384] + - [86, 49.683] + - - [128, 128, 49, 1120] + - [95, 64.576] + - - [128, 128, 49, 1064] + - [95, 65.948] + - - [128, 128, 49, 1040] + - [73, 65.88] + - - [128, 128, 64, 600] + - [121, 68.104] + - - [128, 128, 64, 616] + - [74, 68.059] + - - [128, 128, 49, 950] + - [95, 65.023] + - - [128, 128, 49, 972] + - [118, 65.172] + - - [128, 128, 64, 560] + - [96, 67.586] + - - [128, 128, 49, 1008] + - [95, 65.808] + - - [128, 128, 64, 532] + - [74, 66.476] + - - [128, 128, 49, 1080] + - [95, 65.988] + - - [128, 128, 64, 588] + - [121, 67.062] + - - [128, 128, 49, 1160] + - [95, 66.106] + - - [128, 128, 49, 988] + - [118, 65.23] + - - [128, 128, 49, 936] + - [73, 65.501] + - - [512, 1024, 1, 3800] + - [74, 65.966] + - - [512, 1024, 1, 3400] + - [121, 65.772] + - - [512, 1024, 1, 3456] + - [126, 61.707] + - - [2048, 512, 1, 950] + - [96, 70.026] + - - [512, 1024, 1, 3552] + - [96, 65.907] + - - [512, 1024, 1, 3220] + - [96, 65.325] + - - [2048, 512, 1, 850] + - [96, 69.526] + - - [512, 2048, 1, 864] + - [121, 70.279] + - - [512, 2048, 1, 768] + - [136, 64.328] + - - [2048, 512, 1, 805] + - [96, 69.395] + - - [512, 1024, 1, 2852] + - [121, 65.091] + - - [512, 2048, 1, 888] + - [121, 70.509] + - - [2048, 512, 1, 864] + - [96, 70.229] + - - [2048, 512, 1, 888] + - [96, 70.297] + - - [2048, 256, 1, 950] + - [73, 60.051] + - - [2048, 512, 1, 713] + - [96, 68.781] + - - [512, 1024, 1, 2688] + - [126, 61.089] + - - [512, 1024, 1, 2640] + - [121, 65.375] + - - [512, 1024, 1, 2904] + - [96, 65.47] + - - [1024, 512, 1, 950] + - [96, 60.601] + - - [512, 2048, 1, 672] + - [96, 69.656] + - - [512, 2048, 1, 660] + - [96, 68.899] + - - [512, 2048, 1, 1008] + - [96, 70.753] + - - [2048, 256, 1, 850] + - [73, 59.875] + - - [2048, 512, 1, 726] + - [121, 69.025] + - - [1024, 512, 1, 850] + - [73, 60.164] + - - [2048, 512, 1, 660] + - [121, 68.443] + - - [2048, 512, 1, 672] + - [96, 69.503] + - - [512, 2048, 1, 840] + - [121, 70.063] + - - [2048, 512, 1, 1008] + - [96, 70.73] + - - [512, 2048, 1, 792] + - [121, 70.085] + - - [1024, 512, 1, 805] + - [73, 59.618] + - - [512, 2048, 1, 1050] + - [121, 70.442] + - - [2048, 512, 1, 748] + - [96, 69.359] + - - [2048, 256, 1, 864] + - [74, 60.43] + - - [1024, 512, 1, 864] + - [96, 61.116] + - - [2048, 512, 1, 875] + - [96, 69.688] + - - [2048, 512, 1, 840] + - [96, 69.963] + - - [2048, 512, 1, 792] + - [96, 69.905] + - - [512, 2048, 1, 736] + - [96, 69.832] + - - [2048, 256, 1, 888] + - [73, 60.462] + - - [512, 2048, 1, 704] + - [121, 69.503] + - - [512, 2048, 1, 588] + - [96, 68.348] + - - [1024, 512, 1, 888] + - [121, 61.026] + - - [512, 2048, 1, 816] + - [121, 70.139] + - - [1024, 512, 1, 713] + - [73, 59.054] + - - [2048, 512, 1, 736] + - [121, 69.557] + - - [2048, 512, 1, 588] + - [96, 68.244] + - - [2048, 512, 1, 704] + - [96, 69.3] + - - [1024, 512, 1, 660] + - [73, 58.648] + - - [2048, 256, 1, 660] + - [73, 58.341] + - - [2048, 256, 1, 672] + - [73, 59.049] + - - [1024, 512, 1, 672] + - [73, 59.18] + - - [1024, 512, 1, 726] + - [73, 59.013] + - - [512, 2048, 1, 630] + - [96, 68.641] + - - [512, 2048, 1, 600] + - [121, 69.133] + - - [2048, 256, 1, 805] + - [73, 59.392] + - - [2048, 256, 1, 713] + - [73, 58.887] + - - [2048, 256, 1, 726] + - [73, 58.734] + - - [320, 1024, 1, 1024] + - [129, 36.513] + - - [1024, 1000, 1, 1024] + - [105, 63.299] + - - [320, 1000, 1, 1024] + - [102, 36.324] + - - [128, 128, 49, 1280] + - [85, 59.979] + - - [128, 128, 49, 1360] + - [95, 66.494] + - - [128, 128, 49, 1200] + - [118, 66.237] + - - [128, 128, 49, 1240] + - [95, 66.246] + - - [2304, 256, 1, 704] + - [74, 65.686] + - - [2304, 256, 1, 736] + - [73, 66.169] + - - [2304, 256, 1, 792] + - [74, 66.737] + - - [2304, 256, 1, 748] + - [95, 65.479] + - - [2304, 256, 1, 726] + - [73, 65.312] + - - [2304, 256, 1, 713] + - [73, 65.113] + - - [2304, 256, 1, 768] + - [87, 60.394] + - - [512, 2048, 1, 759] + - [121, 69.345] + - - [512, 2048, 1, 925] + - [96, 69.995] + - - [2304, 256, 1, 805] + - [73, 66.115] + - - [512, 2048, 1, 900] + - [121, 69.959] + - - [512, 2048, 1, 875] + - [96, 69.972] + - - [512, 2048, 1, 748] + - [96, 69.413] + - - [512, 2048, 1, 726] + - [121, 69.025] + - - [512, 2048, 1, 713] + - [96, 69.065] + - - [512, 2048, 1, 805] + - [96, 69.548] + - - [512, 2048, 1, 850] + - [121, 69.548] + - - [512, 2048, 1, 950] + - [96, 70.049] + - - [96, 1024, 160, 1024] + - [85, 47.865] + - - [96, 1024, 40, 1024] + - [85, 49.737] + - - [96, 1024, 80, 1024] + - [85, 46.552] + - - [96, 1024, 96, 1024] + - [87, 47.662] + - - [96, 1024, 24, 1024] + - [87, 51.488] + - - [96, 1024, 48, 1024] + - [134, 47.003] + - - [96, 1024, 16, 1024] + - [110, 50.667] + - - [96, 1024, 32, 1024] + - [136, 50.807] + - - [64, 512, 320, 512] + - [88, 41.512] + - - [64, 512, 80, 512] + - [86, 51.79] + - - [29000, 109, 1, 2560] + - [87, 54.235] + - - [29000, 121, 1, 2560] + - [110, 62.239] + - - [29000, 65, 1, 2560] + - [87, 34.763] + - - [29000, 66, 1, 2560] + - [136, 35.566] + - - [29000, 67, 1, 2560] + - [136, 35.625] + - - [29000, 69, 1, 2560] + - [87, 37.028] + - - [29000, 70, 1, 2560] + - [110, 37.736] + - - [29000, 71, 1, 2560] + - [110, 38.034] + - - [29000, 73, 1, 2560] + - [87, 39.153] + - - [29000, 74, 1, 2560] + - [110, 39.103] + - - [29000, 75, 1, 2560] + - [87, 39.369] + - - [29000, 77, 1, 2560] + - [87, 41.188] + - - [29000, 78, 1, 2560] + - [136, 41.539] + - - [29000, 80, 1, 2560] + - [87, 42.189] + - - [29000, 81, 1, 2560] + - [110, 43.358] + - - [29000, 82, 1, 2560] + - [136, 42.911] + - - [29000, 83, 1, 2560] + - [87, 44.404] + - - [29000, 84, 1, 2560] + - [136, 44.418] + - - [29000, 88, 1, 2560] + - [87, 46.516] + - - [29000, 89, 1, 2560] + - [110, 47.111] + - - [29000, 90, 1, 2560] + - [87, 47.102] + - - [29000, 92, 1, 2560] + - [87, 48.912] + - - [29000, 95, 1, 2560] + - [136, 48.691] + - - [29000, 98, 1, 2560] + - [136, 49.904] + - - [64, 1024, 512, 1024] + - [111, 42.758] + - - [1024, 200, 1, 13312] + - [147, 48.018] + - - [1024, 256, 1, 15360] + - [147, 61.531] + - - [1024, 256, 1, 16384] + - [148, 54.899] + - - [1024, 200, 1, 16384] + - [147, 44.788] + - - [1024, 256, 1, 12288] + - [147, 60.254] + - - [1024, 200, 1, 12288] + - [147, 47.472] + - - [1024, 200, 1, 15360] + - [147, 48.844] + - - [1024, 256, 1, 9216] + - [147, 58.743] + - - [1024, 200, 1, 14336] + - [147, 48.528] + - - [1024, 256, 1, 16640] + - [156, 62.32] + - - [1024, 200, 1, 8192] + - [147, 45.14] + - - [1024, 200, 1, 10240] + - [147, 46.809] + - - [1024, 200, 1, 9216] + - [147, 46.588] + - - [1024, 256, 1, 11264] + - [147, 60.511] + - - [1024, 200, 1, 8320] + - [147, 45.983] + - - [1024, 256, 1, 8320] + - [162, 58.007] + - - [1024, 200, 1, 16640] + - [162, 49.349] + - - [1024, 256, 1, 14336] + - [147, 61.242] + - - [1024, 256, 1, 13312] + - [147, 60.737] + - - [1024, 200, 1, 11264] + - [147, 47.603] + - - [1024, 256, 1, 8192] + - [147, 54.33] + - - [1024, 256, 1, 10240] + - [147, 59.054] + - - [96, 64, 64, 18432] + - [138, 28.252] + - - [96, 64, 36, 10368] + - [84, 32.782] + - - [96, 64, 36, 20736] + - [154, 32.71] + - - [96, 96, 36, 10368] + - [87, 35.945] + - - [96, 64, 49, 28800] + - [168, 32.114] + - - [96, 64, 36, 41472] + - [168, 32.43] + - - [64, 64, 11, 233600] + - [167, 24.467] + - - [64, 64, 11, 116800] + - [153, 25.293] + - - [64, 64, 9, 172864] + - [143, 28.988] + - - [64, 64, 11, 58400] + - [164, 27.702] + - - [192, 160, 9, 19584] + - [160, 39.103] + - - [128, 128, 9, 9792] + - [149, 57.619] + - - [192, 160, 11, 13056] + - [154, 38.314] + - - [64, 64, 9, 86432] + - [151, 28.104] + - - [128, 128, 9, 19584] + - [148, 52.832] + - - [160, 160, 11, 13056] + - [165, 33.224] + - - [160, 160, 9, 19584] + - [150, 36.053] + - - [192, 128, 9, 19584] + - [162, 48.839] + - - [192, 160, 9, 9792] + - [158, 46.593] + - - [64, 64, 9, 345728] + - [157, 28.577] + - - [128, 128, 11, 13056] + - [146, 47.08] + - - [160, 160, 9, 9792] + - [158, 39.374] + - - [192, 128, 11, 13056] + - [155, 43.836] + - - [192, 128, 9, 9792] + - [159, 53.852] + - - [128, 64, 25, 43320] + - [161, 36.816] + - - [64, 64, 64, 20280] + - [119, 28.401] + - - [64, 64, 49, 27000] + - [97, 27.919] + - - [64, 64, 36, 43320] + - [163, 28.713] + - - [64, 64, 36, 50176] + - [157, 28.072] + - - [64, 64, 49, 36864] + - [88, 27.088] + - - [64, 64, 64, 25600] + - [137, 26.949] + - - [256, 256, 1, 60800] + - [150, 44.364] + - - [256, 256, 1, 54400] + - [144, 44.359] + - - [256, 256, 1, 51520] + - [141, 46.047] + - - [256, 256, 1, 55296] + - [154, 44.305] + - - [256, 256, 1, 56832] + - [146, 44.495] + - - [256, 256, 1, 45632] + - [141, 45.758] + - - [256, 256, 1, 49152] + - [145, 40.299] + - - [256, 512, 1, 13600] + - [149, 56.798] + - - [256, 256, 1, 43008] + - [146, 43.773] + - - [256, 512, 1, 15200] + - [152, 57.538] + - - [256, 512, 1, 12880] + - [142, 55.981] + - - [256, 512, 1, 13824] + - [162, 52.954] + - - [512, 256, 1, 13824] + - [166, 52.255] + - - [256, 512, 1, 14208] + - [156, 53.784] + - - [512, 256, 1, 14208] + - [166, 53.468] + - - [512, 256, 1, 15200] + - [142, 57.619] + - - [256, 512, 1, 12288] + - [156, 51.434] + - - [512, 256, 1, 12288] + - [147, 51.122] + - - [1024, 200, 1, 560] + - [191, 20.402] + - - [768, 320, 1, 768] + - [215, 25.649] + - - [1024, 120, 1, 1024] + - [215, 20.966] + - - [1024, 128, 1, 128] + - [213, 15.078] + - - [2368, 64, 1, 3328] + - [197, 18.53] + - - [1408, 64, 1, 1280] + - [215, 15.904] + - - [4096, 32, 1, 4096] + - [179, 21.02] + - - [3072, 64, 1, 1024] + - [197, 22.035] + - - [2944, 64, 1, 256] + - [197, 19.712] + - - [6144, 32, 1, 2560] + - [181, 21.467] + - - [1856, 64, 1, 1280] + - [183, 20.501] + - - [704, 128, 1, 1280] + - [203, 16.563] + - - [4288, 64, 1, 3328] + - [199, 29.2] + - - [64, 3584, 1, 3328] + - [215, 24.742] + - - [704, 256, 1, 128] + - [213, 16.68] + - - [128, 1408, 1, 128] + - [181, 17.063] + - - [448, 448, 1, 256] + - [213, 21.034] + - - [7680, 32, 1, 2560] + - [213, 21.936] + - - [128, 1024, 1, 3328] + - [183, 24.287] + - - [64, 1856, 1, 1280] + - [199, 21.372] + - - [256, 1024, 1, 256] + - [199, 24.9] + - - [1024, 128, 1, 1280] + - [215, 23.118] + - - [3072, 32, 1, 1024] + - [179, 16.784] + - - [448, 256, 1, 3328] + - [199, 21.128] + - - [128, 1024, 1, 128] + - [181, 15.448] + - - [448, 448, 1, 3328] + - [197, 23.971] + - - [128, 704, 1, 1280] + - [211, 17.095] + - - [1856, 128, 1, 3328] + - [199, 25.609] + - - [35, 8457, 1, 1760] + - [174, 16.12] + - - [64, 2944, 1, 128] + - [181, 17.839] + - - [8448, 32, 1, 2816] + - [181, 23.303] + - - [1408, 128, 1, 1280] + - [213, 21.39] + - - [128, 1856, 1, 1280] + - [183, 25.117] + - - [2560, 64, 1, 2560] + - [213, 19.73] + - - [256, 448, 1, 256] + - [199, 16.896] + - - [128, 1856, 1, 128] + - [199, 20.357] + - - [2560, 32, 1, 2560] + - [211, 14.893] + - - [128, 1408, 1, 256] + - [181, 19.382] + - - [35, 8457, 1, 2560] + - [215, 16.12] + - - [4288, 64, 1, 128] + - [199, 23.24] + - - [256, 448, 1, 3328] + - [174, 21.183] + - - [64, 2368, 1, 1280] + - [204, 18.182] + - - [2368, 64, 1, 256] + - [197, 16.03] + - - [704, 128, 1, 3328] + - [188, 16.946] + - - [4288, 64, 1, 1280] + - [199, 28.825] + - - [1408, 128, 1, 128] + - [181, 16.734] + - - [128, 1024, 1, 1280] + - [183, 23.136] + - - [2944, 64, 1, 128] + - [181, 17.325] + - - [1024, 128, 1, 3328] + - [199, 23.971] + - - [704, 128, 1, 256] + - [199, 13.413] + - - [448, 256, 1, 1280] + - [183, 20.813] + - - [1856, 128, 1, 1280] + - [199, 25.144] + - - [64, 3584, 1, 256] + - [199, 22.04] + - - [3584, 64, 1, 128] + - [199, 18.245] + - - [256, 1024, 1, 1280] + - [199, 27.657] + - - [3584, 64, 1, 1280] + - [215, 24.075] + - - [64, 4288, 1, 3328] + - [183, 29.268] + - - [64, 1856, 1, 256] + - [199, 17.411] + - - [35, 8457, 1, 2048] + - [183, 16.098] + - - [256, 704, 1, 256] + - [197, 19.166] + - - [2368, 64, 1, 128] + - [181, 14.253] + - - [256, 1024, 1, 128] + - [215, 22.004] + - - [704, 256, 1, 3328] + - [213, 22.04] + - - [35, 8457, 1, 4096] + - [199, 15.593] + - - [64, 2944, 1, 256] + - [213, 18.827] + - - [448, 256, 1, 128] + - [199, 13.63] + - - [64, 1408, 1, 1280] + - [179, 16.851] + - - [1408, 128, 1, 256] + - [197, 18.958] + - - [64, 2944, 1, 1280] + - [189, 22.153] + - - [128, 704, 1, 128] + - [178, 11.121] + - - [64, 1408, 1, 3328] + - [179, 17.325] + - - [256, 448, 1, 1280] + - [215, 20.673] + - - [704, 256, 1, 1280] + - [197, 21.548] + - - [64, 2368, 1, 3328] + - [204, 18.724] + - - [1856, 64, 1, 128] + - [181, 13.828] + - - [4096, 64, 1, 4096] + - [183, 27.9] + - - [1760, 128, 1, 1760] + - [174, 24.052] + - - [704, 128, 1, 128] + - [178, 11.216] + - - [256, 704, 1, 3328] + - [197, 22.229] + - - [256, 448, 1, 128] + - [199, 13.301] + - - [64, 3584, 1, 128] + - [199, 19.712] + - - [64, 2944, 1, 3328] + - [172, 22.744] + - - [1024, 128, 1, 256] + - [215, 18.597] + - - [2944, 64, 1, 1280] + - [197, 22.324] + - - [128, 1408, 1, 3328] + - [197, 22.193] + - - [1408, 64, 1, 256] + - [178, 13.012] + - - [64, 1856, 1, 128] + - [181, 14.059] + - - [64, 2368, 1, 256] + - [181, 16.36] + - - [1856, 128, 1, 128] + - [183, 20.294] + - - [2368, 64, 1, 1280] + - [197, 18.25] + - - [4288, 64, 1, 256] + - [183, 26.109] + - - [64, 4288, 1, 1280] + - [215, 28.92] + - - [1408, 64, 1, 3328] + - [188, 16.693] + - - [64, 1408, 1, 128] + - [169, 11.027] + - - [256, 704, 1, 128] + - [181, 16.68] + - - [1408, 64, 1, 128] + - [178, 11.027] + - - [448, 448, 1, 1280] + - [197, 23.975] + - - [128, 1024, 1, 256] + - [183, 19.017] + - - [3584, 64, 1, 3328] + - [215, 24.625] + - - [256, 1024, 1, 3328] + - [183, 28.009] + - - [1856, 64, 1, 3328] + - [183, 21.422] + - - [448, 256, 1, 256] + - [199, 17.072] + - - [4608, 32, 1, 1536] + - [179, 22.739] + - - [128, 704, 1, 256] + - [179, 13.346] + - - [64, 3584, 1, 1280] + - [199, 24.341] + - - [3584, 64, 1, 256] + - [199, 21.435] + - - [64, 1856, 1, 3328] + - [215, 21.625] + - - [2048, 128, 1, 2048] + - [199, 27.855] + - - [1408, 128, 1, 3328] + - [197, 22.022] + - - [128, 704, 1, 3328] + - [211, 17.799] + - - [128, 1856, 1, 256] + - [183, 22.527] + - - [64, 4288, 1, 256] + - [199, 26.281] + - - [1856, 64, 1, 256] + - [199, 16.978] + - - [256, 704, 1, 1280] + - [197, 21.774] + - - [64, 2368, 1, 128] + - [181, 14.442] + - - [64, 4288, 1, 128] + - [199, 23.515] + - - [1856, 128, 1, 256] + - [199, 22.983] + - - [2048, 64, 1, 2048] + - [197, 23.032] + - - [64, 1408, 1, 256] + - [179, 13.445] + - - [2944, 64, 1, 3328] + - [197, 22.87] + - - [128, 1408, 1, 1280] + - [213, 21.71] + - - [128, 1856, 1, 3328] + - [183, 25.514] + - - [1760, 64, 1, 1760] + - [191, 20.939] + - - [448, 448, 1, 128] + - [181, 18.457] + - - [704, 256, 1, 256] + - [213, 18.958] + - - [256, 1024, 1, 196] + - [191, 23.655] + - - [1024, 256, 1, 1536] + - [215, 27.607] + - - [1024, 200, 1, 1408] + - [191, 21.805] + - - [1024, 200, 1, 6144] + - [183, 22.04] + - - [1024, 256, 1, 3328] + - [199, 28.185] + - - [512, 256, 1, 3200] + - [174, 24.819] + - - [1024, 200, 1, 4608] + - [199, 22.148] + - - [512, 256, 1, 1792] + - [215, 23.583] + - - [1024, 200, 1, 1792] + - [199, 21.904] + - - [512, 200, 1, 2816] + - [174, 19.08] + - - [512, 200, 1, 3072] + - [215, 18.864] + - - [1024, 200, 1, 128] + - [191, 17.343] + - - [1024, 200, 1, 5120] + - [215, 22.116] + - - [1024, 256, 1, 256] + - [199, 24.778] + - - [512, 256, 1, 2560] + - [215, 23.664] + - - [1024, 256, 1, 4160] + - [191, 28.442] + - - [1024, 200, 1, 512] + - [215, 20.898] + - - [512, 512, 1, 1536] + - [199, 27.77] + - - [1024, 256, 1, 896] + - [215, 27.413] + - - [1024, 200, 1, 3200] + - [215, 22.121] + - - [1024, 200, 1, 1536] + - [215, 21.787] + - - [1024, 256, 1, 1024] + - [215, 27.291] + - - [128, 1024, 1, 512] + - [183, 21.656] + - - [1024, 256, 1, 5120] + - [183, 27.923] + - - [1024, 200, 1, 2304] + - [199, 21.99] + - - [1024, 256, 1, 1664] + - [199, 27.964] + - - [512, 512, 1, 1024] + - [199, 27.436] + - - [1024, 256, 1, 2080] + - [191, 28.18] + - - [512, 200, 1, 768] + - [183, 17.83] + - - [1024, 256, 1, 2816] + - [199, 28.14] + - - [1024, 200, 1, 64] + - [191, 14.424] + - - [512, 512, 1, 2304] + - [199, 27.873] + - - [128, 1024, 1, 2048] + - [183, 23.56] + - - [512, 200, 1, 2560] + - [183, 18.931] + - - [512, 256, 1, 1024] + - [183, 23.434] + - - [1024, 256, 1, 1920] + - [199, 27.995] + - - [512, 200, 1, 2304] + - [174, 18.769] + - - [1024, 256, 1, 384] + - [199, 26.132] + - - [1024, 256, 1, 32] + - [191, 13.233] + - - [1024, 200, 1, 2816] + - [215, 22.089] + - - [1024, 200, 1, 3072] + - [199, 21.999] + - - [512, 256, 1, 1536] + - [183, 23.506] + - - [1024, 256, 1, 512] + - [215, 26.488] + - - [256, 512, 1, 512] + - [199, 21.877] + - - [1024, 200, 1, 3840] + - [199, 21.936] + - - [256, 1024, 1, 512] + - [199, 26.745] + - - [1024, 256, 1, 1152] + - [199, 27.684] + - - [512, 512, 1, 2816] + - [199, 27.977] + - - [512, 200, 1, 1280] + - [199, 18.597] + - - [512, 200, 1, 3200] + - [191, 19.274] + - - [1024, 256, 1, 2304] + - [199, 28.031] + - - [1024, 256, 1, 6144] + - [183, 28.085] + - - [1024, 200, 1, 2560] + - [215, 22.031] + - - [1024, 256, 1, 5632] + - [215, 28.117] + - - [512, 256, 1, 768] + - [199, 22.938] + - - [1024, 256, 1, 3072] + - [215, 27.968] + - - [256, 512, 1, 2048] + - [183, 23.574] + - - [1024, 200, 1, 1152] + - [215, 21.715] + - - [512, 512, 1, 3072] + - [183, 28.049] + - - [1024, 200, 1, 1664] + - [215, 21.895] + - - [1024, 200, 1, 32] + - [191, 9.727] + - - [1024, 200, 1, 384] + - [215, 20.533] + - - [512, 256, 1, 2304] + - [215, 23.813] + - - [256, 512, 1, 1024] + - [183, 23.29] + - - [1024, 200, 1, 3328] + - [215, 22.125] + - - [1024, 200, 1, 2080] + - [191, 22.067] + - - [512, 200, 1, 1792] + - [174, 18.629] + - - [1024, 256, 1, 1792] + - [199, 27.91] + - - [1024, 200, 1, 7168] + - [183, 22.076] + - - [512, 256, 1, 3072] + - [215, 23.669] + - - [1024, 200, 1, 2048] + - [215, 21.909] + - - [512, 512, 1, 1280] + - [199, 27.657] + - - [1024, 200, 1, 1280] + - [215, 21.679] + - - [512, 200, 1, 512] + - [183, 16.896] + - - [1024, 256, 1, 2560] + - [199, 27.964] + - - [1024, 200, 1, 1024] + - [183, 21.417] + - - [1024, 256, 1, 3200] + - [199, 28.266] + - - [512, 512, 1, 2560] + - [199, 28.022] + - - [1024, 256, 1, 640] + - [199, 26.994] + - - [1024, 256, 1, 3584] + - [199, 28.063] + - - [512, 512, 1, 3200] + - [191, 28.289] + - - [1024, 256, 1, 7680] + - [215, 27.639] + - - [512, 200, 1, 1536] + - [183, 18.539] + - - [512, 256, 1, 2816] + - [215, 23.993] + - - [1024, 200, 1, 768] + - [215, 21.386] + - - [512, 200, 1, 2048] + - [183, 18.584] + - - [1024, 256, 1, 128] + - [215, 21.814] + - - [1024, 200, 1, 4096] + - [215, 22.058] + - - [1024, 256, 1, 1280] + - [199, 27.648] + - - [1024, 200, 1, 896] + - [199, 21.525] + - - [1024, 256, 1, 4608] + - [215, 27.995] + - - [128, 1024, 1, 1024] + - [183, 23.059] + - - [1024, 256, 1, 2048] + - [199, 27.846] + - - [512, 256, 1, 1280] + - [183, 23.565] + - - [256, 1024, 1, 2048] + - [183, 27.86] + - - [512, 512, 1, 2048] + - [183, 27.81] + - - [512, 256, 1, 512] + - [199, 22.198] + - - [1024, 200, 1, 7680] + - [215, 21.922] + - - [1024, 200, 1, 6656] + - [199, 22.162] + - - [512, 200, 1, 1024] + - [183, 18.381] + - - [1024, 256, 1, 3840] + - [215, 27.837] + - - [512, 512, 1, 768] + - [199, 27.341] + - - [1024, 256, 1, 64] + - [191, 18.552] + - - [1024, 200, 1, 1920] + - [174, 21.904] + - - [1024, 256, 1, 7168] + - [183, 28.09] + - - [512, 512, 1, 1792] + - [199, 27.779] + - - [1024, 200, 1, 256] + - [215, 19.68] + - - [256, 1024, 1, 1024] + - [199, 27.377] + - - [1024, 200, 1, 640] + - [215, 21.241] + - - [1024, 200, 1, 4160] + - [191, 22.238] + - - [1024, 200, 1, 5632] + - [199, 22.157] + - - [1024, 256, 1, 6656] + - [183, 28.144] + - - [1024, 256, 1, 768] + - [215, 27.21] + - - [512, 256, 1, 2048] + - [183, 23.673] + - - [1024, 200, 1, 3584] + - [215, 22.121] + - - [1024, 256, 1, 1408] + - [191, 27.828] + - - [1024, 256, 1, 4096] + - [183, 27.891] + - - [1024, 128, 1, 289] + - [191, 18.949] + - - [768, 192, 1, 289] + - [191, 21.507] + - - [32, 32, 1984, 64] + - [176, 16.847] + - - [54, 54, 1184, 64] + - [183, 22.175] + - - [35, 35, 1808, 64] + - [215, 9.624] + - - [45, 45, 1424, 64] + - [191, 15.8] + - - [49, 49, 1296, 64] + - [183, 18.444] + - - [59, 59, 1088, 64] + - [174, 26.312] + - - [41, 41, 1552, 64] + - [191, 13.107] + - - [38, 38, 1680, 64] + - [183, 11.383] + - - [2048, 128, 1, 4096] + - [199, 27.991] + - - [1024, 128, 1, 1024] + - [199, 22.816] + - - [1152, 128, 1, 784] + - [199, 25.32] + - - [864, 96, 1, 1225] + - [170, 15.498] + - - [896, 192, 1, 289] + - [204, 18.755] + - - [768, 128, 1, 289] + - [170, 14.501] + - - [1344, 192, 1, 289] + - [174, 24.999] + - - [384, 192, 1, 1225] + - [205, 15.529] + - - [832, 192, 1, 49] + - [204, 10.206] + - - [1280, 192, 1, 64] + - [191, 17.307] + - - [512, 256, 1, 196] + - [172, 16.896] + - - [864, 96, 1, 289] + - [170, 12.782] + - - [896, 128, 1, 289] + - [215, 16.878] + - - [1200, 64, 1, 1225] + - [188, 14.546] + - - [1024, 256, 1, 289] + - [191, 24.986] + - - [1024, 256, 1, 196] + - [191, 23.849] + - - [1120, 192, 1, 289] + - [174, 20.957] + - - [800, 96, 1, 784] + - [188, 14.343] + - - [864, 128, 1, 784] + - [199, 18.972] + - - [1344, 224, 1, 289] + - [204, 20.749] + - - [1152, 192, 1, 784] + - [174, 23.244] + - - [800, 128, 1, 196] + - [188, 13.806] + - - [864, 208, 1, 196] + - [204, 18.142] + - - [720, 192, 1, 5041] + - [174, 25.347] + - - [576, 192, 1, 3136] + - [174, 21.264] + - - [832, 256, 1, 49] + - [204, 13.228] + - - [1200, 128, 1, 49] + - [174, 9.538] + - - [528, 256, 1, 196] + - [215, 17.474] + - - [256, 512, 1, 784] + - [199, 23.114] + - - [480, 192, 1, 196] + - [179, 12.615] + - - [96, 64, 36, 2592] + - [206, 23.777] + - - [96, 96, 36, 2592] + - [170, 21.634] + - - [1024, 192, 1, 289] + - [204, 21.151] + - - [528, 160, 1, 196] + - [169, 11.6] + - - [512, 160, 1, 196] + - [169, 11.685] + - - [768, 160, 1, 289] + - [172, 17.722] + - - [64, 32, 36, 43808] + - [187, 13.454] + - - [832, 160, 1, 49] + - [173, 8.315] + - - [2048, 64, 1, 1001] + - [191, 21.417] + - - [2048, 128, 1, 1001] + - [191, 27.129] + - - [1536, 64, 1, 1001] + - [203, 18.02] + - - [96, 96, 49, 3136] + - [203, 22.189] + - - [64, 32, 49, 57600] + - [217, 13.003] + - - [96, 64, 49, 6272] + - [183, 20.009] + - - [64, 32, 49, 115200] + - [201, 11.961] + - - [96, 96, 64, 2304] + - [197, 20.632] + - - [96, 96, 49, 6272] + - [189, 18.787] + - - [96, 64, 36, 5184] + - [199, 23.858] + - - [64, 32, 64, 40000] + - [189, 18.381] + - - [96, 64, 64, 4608] + - [215, 19.486] + - - [96, 96, 36, 5184] + - [188, 21.701] + - - [96, 64, 64, 2304] + - [215, 21.219] + - - [96, 64, 49, 3136] + - [191, 21.765] + - - [64, 32, 36, 87616] + - [214, 13.296] + - - [64, 32, 64, 80000] + - [209, 15.263] + - - [96, 96, 64, 4608] + - [197, 18.119] + - - [64, 32, 36, 175232] + - [198, 12.362] + - - [128, 128, 11, 3264] + - [188, 22.211] + - - [192, 128, 11, 6528] + - [191, 29.128] + - - [128, 128, 11, 6528] + - [189, 22.283] + - - [160, 160, 9, 4896] + - [172, 21.318] + - - [192, 160, 11, 6528] + - [189, 24.963] + - - [192, 128, 9, 4896] + - [191, 24.079] + - - [128, 128, 9, 4896] + - [191, 28.334] + - - [192, 128, 11, 3264] + - [174, 29.213] + - - [160, 160, 11, 3264] + - [203, 22.929] + - - [192, 160, 9, 4896] + - [204, 25.55] + - - [192, 160, 11, 3264] + - [172, 24.95] + - - [160, 160, 11, 6528] + - [195, 22.879] + - - [4096, 64, 1, 1024] + - [215, 27.156] + - - [49, 49, 160, 64] + - [191, 13.269] + - - [54, 54, 592, 64] + - [183, 20.736] + - - [59, 59, 512, 64] + - [206, 24.183] + - - [104, 104, 16, 64] + - [174, 11.568] + - - [32, 32, 624, 64] + - [173, 14.063] + - - [32, 32, 992, 64] + - [176, 15.182] + - - [35, 35, 384, 64] + - [183, 8.509] + - - [35, 35, 904, 64] + - [183, 9.362] + - - [38, 38, 320, 64] + - [199, 9.899] + - - [38, 38, 840, 64] + - [191, 11.027] + - - [41, 41, 312, 64] + - [174, 11.252] + - - [41, 41, 776, 64] + - [174, 12.488] + - - [45, 45, 392, 64] + - [183, 14.063] + - - [45, 45, 712, 64] + - [199, 15.006] + - - [49, 49, 648, 64] + - [215, 17.591] + - - [54, 54, 200, 64] + - [215, 17.045] + - - [59, 59, 544, 64] + - [206, 24.675] + - - [91, 91, 40, 64] + - [203, 14.582] + - - [91, 93, 40, 64] + - [203, 15.038] + - - [93, 93, 40, 64] + - [203, 15.227] + - - [102, 102, 56, 64] + - [174, 15.75] + - - [103, 103, 16, 64] + - [174, 11.189] + - - [103, 104, 16, 64] + - [191, 11.351] + - - [112, 112, 16, 64] + - [191, 13.603] + - - [112, 123, 16, 64] + - [191, 15.011] + - - [119, 119, 32, 64] + - [174, 18.909] + - - [119, 135, 32, 64] + - [189, 17.713] + - - [123, 123, 16, 64] + - [174, 15.263] + - - [512, 512, 1, 512] + - [199, 26.867] + - - [513, 512, 1, 512] + - [183, 25.839] + - - [512, 512, 1, 513] + - [199, 26.565] + - - [512, 512, 1, 511] + - [191, 26.813] + - - [512, 513, 1, 512] + - [215, 26.132] + - - [512, 511, 1, 512] + - [199, 26.321] + - - [511, 512, 1, 512] + - [183, 26.231] + - - [479, 512, 1, 512] + - [199, 24.783] + - - [480, 511, 1, 512] + - [215, 24.787] + - - [480, 512, 1, 511] + - [174, 25.162] + - - [480, 512, 1, 513] + - [174, 25.058] + - - [480, 513, 1, 512] + - [183, 24.905] + - - [481, 512, 1, 512] + - [183, 24.887] + - - [511, 480, 1, 512] + - [183, 24.508] + - - [512, 479, 1, 512] + - [199, 24.805] + - - [512, 480, 1, 511] + - [191, 25.004] + - - [512, 480, 1, 513] + - [191, 25.036] + - - [512, 481, 1, 512] + - [199, 24.95] + - - [513, 480, 1, 512] + - [199, 24.499] + - - [480, 512, 1, 512] + - [183, 24.923] + - - [512, 480, 1, 512] + - [199, 24.724] + - - [512, 512, 1, 64] + - [191, 18.462] + - - [2048, 114, 1, 512] + - [199, 23.551] + - - [2048, 114, 1, 768] + - [199, 24.192] + - - [256, 684, 1, 1024] + - [213, 20.903] + - - [33, 33, 1600, 32] + - [191, 8.27] + - - [383, 384, 1, 384] + - [215, 22.874] + - - [385, 384, 1, 384] + - [181, 16.459] + - - [384, 383, 1, 384] + - [215, 22.956] + - - [384, 385, 1, 384] + - [181, 16.459] + - - [384, 384, 1, 383] + - [215, 23.335] + - - [384, 384, 1, 385] + - [215, 23.371] + - - [384, 384, 1, 384] + - [199, 23.741] + - - [128, 64, 25, 6498] + - [204, 25.23] + - - [128, 64, 25, 6859] + - [189, 24.819] + - - [64, 64, 64, 3042] + - [191, 28.076] + - - [64, 64, 64, 3211] + - [183, 27.837] + - - [64, 64, 49, 4050] + - [189, 24.539] + - - [64, 64, 49, 4275] + - [204, 24.58] + - - [64, 64, 36, 6498] + - [215, 27.738] + - - [64, 64, 36, 6859] + - [215, 25.979] + - - [1152, 128, 1, 1444] + - [174, 25.387] + - - [512, 256, 1, 361] + - [172, 19.838] + - - [576, 128, 1, 1444] + - [190, 15.583] + - - [1024, 308, 1, 1024] + - [197, 22.45] + - - [1024, 160, 1, 1024] + - [179, 19.581] + - - [1024, 180, 1, 1024] + - [213, 21.48] + - - [32, 32, 4608, 64] + - [208, 17.636] + - - [32, 35, 4608, 64] + - [203, 14.298] + - - [34, 34, 4736, 64] + - [199, 9.29] + - - [35, 35, 4608, 64] + - [174, 9.863] + - - [128, 864, 1, 256] + - [197, 15.85] + - - [256, 864, 1, 512] + - [199, 22.098] + - - [512, 256, 1, 784] + - [199, 23.159] + - - [1024, 96, 1, 1024] + - [179, 18.151] + - - [1024, 256, 1, 3800] + - [199, 27.991] + - - [1024, 256, 1, 3400] + - [199, 27.955] + - - [256, 1024, 1, 3400] + - [183, 28.162] + - - [1024, 256, 1, 3220] + - [215, 28.004] + - - [256, 1024, 1, 3220] + - [191, 28.162] + - - [1024, 256, 1, 3456] + - [199, 28.248] + - - [256, 1024, 1, 3456] + - [199, 28.221] + - - [256, 1024, 1, 3072] + - [199, 27.788] + - - [1024, 256, 1, 3552] + - [191, 28.388] + - - [256, 1024, 1, 3552] + - [191, 28.415] + - - [256, 1024, 1, 2852] + - [191, 28.122] + - - [1024, 256, 1, 2852] + - [215, 27.964] + - - [256, 512, 1, 10752] + - [213, 24.124] + - - [256, 1024, 1, 3800] + - [183, 28.207] + - - [256, 512, 1, 10560] + - [191, 25.536] + - - [256, 1024, 1, 2992] + - [191, 28.293] + - - [256, 1024, 1, 2688] + - [206, 28.185] + - - [1024, 256, 1, 2688] + - [215, 28.176] + - - [256, 1024, 1, 2904] + - [191, 28.085] + - - [1024, 256, 1, 2904] + - [215, 27.905] + - - [256, 1024, 1, 2640] + - [191, 28.23] + - - [1024, 256, 1, 2640] + - [191, 28.149] + - - [1024, 256, 1, 4032] + - [191, 28.424] + - - [1024, 256, 1, 2992] + - [199, 28.18] + - - [256, 1024, 1, 3360] + - [191, 28.406] + - - [1024, 256, 1, 3360] + - [191, 28.361] + - - [1024, 256, 1, 3500] + - [199, 27.919] + - - [256, 1024, 1, 3500] + - [199, 28.081] + - - [1024, 256, 1, 3168] + - [191, 28.334] + - - [256, 1024, 1, 3168] + - [191, 28.401] + - - [256, 1024, 1, 3036] + - [191, 28.234] + - - [1024, 256, 1, 4200] + - [199, 28.013] + - - [1024, 256, 1, 3600] + - [199, 28.234] + - - [256, 1024, 1, 3600] + - [199, 28.311] + - - [256, 1024, 1, 2944] + - [191, 28.221] + - - [1024, 256, 1, 2944] + - [215, 28.198] + - - [1024, 256, 1, 3700] + - [191, 28.054] + - - [256, 1024, 1, 2352] + - [191, 28.248] + - - [1024, 256, 1, 2352] + - [191, 28.104] + - - [256, 1024, 1, 3700] + - [183, 28.171] + - - [256, 1024, 1, 2816] + - [199, 27.855] + - - [256, 512, 1, 11408] + - [191, 24.787] + - - [1024, 256, 1, 3036] + - [191, 28.09] + - - [1024, 256, 1, 3264] + - [191, 28.343] + - - [256, 1024, 1, 3264] + - [191, 28.352] + - - [1024, 256, 1, 3864] + - [199, 27.991] + - - [256, 1024, 1, 4032] + - [215, 28.428] + - - [1024, 256, 1, 3128] + - [199, 27.941] + - - [256, 1024, 1, 3128] + - [183, 28.149] + - - [256, 1024, 1, 3200] + - [191, 28.252] + - - [256, 512, 1, 11616] + - [206, 25.518] + - - [1024, 256, 1, 4000] + - [199, 28.41] + - - [256, 1024, 1, 2520] + - [191, 28.122] + - - [1024, 256, 1, 2520] + - [199, 27.864] + - - [256, 1024, 1, 2976] + - [191, 28.361] + - - [256, 1024, 1, 2400] + - [191, 28.329] + - - [1024, 256, 1, 2400] + - [191, 28.239] + - - [1024, 256, 1, 3696] + - [191, 28.239] + - - [1024, 256, 1, 3900] + - [191, 28.203] + - - [1024, 256, 1, 3772] + - [191, 28.158] + - - [256, 1024, 1, 3696] + - [183, 28.325] + - - [256, 1024, 1, 2728] + - [191, 28.126] + - - [1024, 256, 1, 2728] + - [191, 27.878] + - - [1024, 256, 1, 2480] + - [191, 28.117] + - - [256, 1024, 1, 2480] + - [191, 28.275] + - - [1024, 256, 1, 2880] + - [191, 28.307] + - - [512, 256, 1, 3220] + - [172, 23.623] + - - [256, 1024, 1, 2880] + - [191, 28.379] + - - [256, 1024, 1, 4200] + - [183, 28.216] + - - [1024, 256, 1, 3648] + - [191, 28.374] + - - [1024, 256, 1, 3312] + - [191, 28.203] + - - [256, 1024, 1, 3648] + - [191, 28.406] + - - [1024, 256, 1, 3300] + - [215, 28.031] + - - [1024, 256, 1, 3528] + - [199, 27.982] + - - [256, 1024, 1, 2604] + - [191, 28.054] + - - [1024, 256, 1, 2604] + - [191, 27.828] + - - [512, 256, 1, 11408] + - [206, 24.778] + - - [256, 1024, 1, 3312] + - [191, 28.302] + - - [256, 1024, 1, 3300] + - [183, 28.081] + - - [256, 1024, 1, 3528] + - [183, 28.167] + - - [1024, 256, 1, 2976] + - [191, 28.325] + - - [1024, 256, 1, 2760] + - [191, 27.891] + - - [512, 256, 1, 3800] + - [172, 23.799] + - - [256, 1024, 1, 2760] + - [191, 28.185] + - - [1024, 256, 1, 2160] + - [191, 28.063] + - - [256, 1024, 1, 2160] + - [191, 28.239] + - - [512, 256, 1, 11616] + - [206, 25.536] + - - [512, 256, 1, 2852] + - [172, 23.547] + - - [256, 1024, 1, 3864] + - [199, 28.153] + - - [512, 256, 1, 2640] + - [191, 24.16] + - - [256, 1024, 1, 4000] + - [199, 28.442] + - - [512, 256, 1, 2904] + - [172, 23.673] + - - [256, 1024, 1, 3900] + - [183, 28.261] + - - [512, 256, 1, 2688] + - [191, 24.314] + - - [256, 1024, 1, 3772] + - [183, 28.252] + - - [512, 256, 1, 3400] + - [172, 23.759] + - - [512, 256, 1, 3456] + - [191, 24.377] + - - [512, 256, 1, 3552] + - [191, 25.108] + - - [29000, 35, 1, 2560] + - [215, 16.793] + - - [29000, 36, 1, 2560] + - [199, 17.325] + - - [29000, 39, 1, 2560] + - [199, 18.733] + - - [29000, 40, 1, 2560] + - [183, 19.161] + - - [29000, 42, 1, 2560] + - [199, 20.113] + - - [29000, 43, 1, 2560] + - [215, 20.619] + - - [29000, 44, 1, 2560] + - [199, 21.079] + - - [29000, 46, 1, 2560] + - [215, 22.031] + - - [29000, 48, 1, 2560] + - [183, 22.915] + - - [29000, 49, 1, 2560] + - [183, 23.479] + - - [29000, 50, 1, 2560] + - [199, 23.853] + - - [29000, 51, 1, 2560] + - [215, 24.359] + - - [29000, 53, 1, 2560] + - [199, 25.324] + - - [29000, 54, 1, 2560] + - [215, 25.721] + - - [29000, 55, 1, 2560] + - [215, 26.177] + - - [29000, 56, 1, 2560] + - [215, 26.655] + - - [29000, 57, 1, 2560] + - [199, 27.129] + - - [29000, 58, 1, 2560] + - [199, 27.612] + - - [29000, 59, 1, 2560] + - [183, 28.09] + - - [29000, 61, 1, 2560] + - [183, 28.938] + - - [29000, 63, 1, 2560] + - [199, 29.94] + - - [288, 64, 1, 21609] + - [232, 14.812] + - - [32, 32, 36, 43808] + - [223, 13.869] + - - [32, 32, 64, 40000] + - [234, 13.224] + - - [32, 32, 49, 115200] + - [239, 13.765] + - - [32, 32, 36, 175232] + - [231, 14.298] + - - [32, 32, 49, 57600] + - [239, 13.58] + - - [32, 32, 36, 87616] + - [223, 14.361] + - - [32, 32, 64, 80000] + - [228, 13.165] + - - [256, 128, 1, 13600] + - [219, 21.273] + - - [256, 128, 1, 12880] + - [219, 21.246] + - - [128, 512, 1, 15200] + - [237, 26.389] + - - [512, 128, 1, 15200] + - [237, 26.294] + - - [128, 512, 1, 11408] + - [224, 25.924] + - - [256, 128, 1, 13824] + - [219, 20.713] + - - [128, 512, 1, 11616] + - [224, 25.924] + - - [256, 128, 1, 14208] + - [233, 21.115] + - - [128, 512, 1, 14208] + - [220, 26.15] + - - [256, 128, 1, 15200] + - [219, 21.543] + - - [512, 128, 1, 11408] + - [230, 25.852] + - - [512, 128, 1, 16800] + - [230, 26.439] + - - [128, 512, 1, 11264] + - [230, 25.297] + - - [512, 128, 1, 11616] + - [235, 25.807] + - - [512, 128, 1, 16128] + - [237, 26.236] + - - [512, 128, 1, 11968] + - [237, 25.861] + - - [128, 512, 1, 11968] + - [224, 26.015] + - - [512, 128, 1, 12288] + - [237, 24.81] + - - [128, 512, 1, 12288] + - [230, 25.166] + - - [128, 512, 1, 12672] + - [220, 25.965] + - - [512, 128, 1, 11776] + - [237, 25.618] + - - [512, 128, 1, 12144] + - [226, 25.942] + - - [512, 128, 1, 11264] + - [230, 25.18] + - - [128, 512, 1, 12144] + - [224, 26.028] + - - [512, 128, 1, 12672] + - [230, 25.762] + - - [128, 512, 1, 12512] + - [230, 26.073] + - - [128, 512, 1, 11776] + - [224, 25.636] + - - [256, 128, 1, 12288] + - [240, 18.304] + - - [40, 40, 1, 1909283] + - [222, 1.999] + - - [40, 40, 1, 3818566] + - [222, 1.999] + - - [30522, 20, 1, 1024] + - [196, 9.524] + - - [1760, 32, 1, 1760] + - [241, 12.998] + - - [3584, 4, 1, 1280] + - [242, 3.645] + - - [2944, 4, 1, 256] + - [245, 2.454] + - - [5056, 4, 1, 3328] + - [245, 4.25] + - - [1760, 16, 1, 1760] + - [196, 8.55] + - - [2368, 4, 1, 1280] + - [246, 2.621] + - - [6784, 4, 1, 1280] + - [245, 4.426] + - - [1856, 4, 1, 1280] + - [245, 2.301] + - - [2944, 4, 1, 128] + - [245, 1.908] + - - [3584, 4, 1, 128] + - [245, 2.202] + - - [8448, 16, 1, 2816] + - [243, 12.804] + - - [2368, 4, 1, 256] + - [245, 2.035] + - - [5888, 4, 1, 128] + - [245, 3.104] + - - [4288, 4, 1, 256] + - [245, 3.208] + - - [3584, 4, 1, 3328] + - [245, 3.979] + - - [2048, 16, 1, 2048] + - [180, 9.032] + - - [1408, 4, 1, 256] + - [246, 1.29] + - - [4288, 4, 1, 3328] + - [242, 4.399] + - - [2368, 4, 1, 3328] + - [242, 2.811] + - - [5056, 4, 1, 1280] + - [245, 4.237] + - - [3072, 16, 1, 1024] + - [196, 11.067] + - - [1408, 4, 1, 3328] + - [245, 1.863] + - - [6144, 16, 1, 2560] + - [243, 12.953] + - - [4096, 16, 1, 4096] + - [212, 12.267] + - - [1856, 4, 1, 256] + - [245, 1.674] + - - [6784, 4, 1, 128] + - [245, 3.212] + - - [4288, 4, 1, 128] + - [246, 2.54] + - - [5888, 4, 1, 3328] + - [242, 4.782] + - - [5056, 4, 1, 128] + - [245, 2.793] + - - [5888, 4, 1, 1280] + - [245, 4.512] + - - [2944, 4, 1, 3328] + - [242, 3.442] + - - [2368, 4, 1, 128] + - [246, 1.57] + - - [1856, 4, 1, 128] + - [245, 1.241] + - - [2560, 16, 1, 2560] + - [180, 9.98] + - - [7680, 16, 1, 2560] + - [247, 13.675] + - - [1408, 4, 1, 1280] + - [245, 1.751] + - - [6784, 4, 1, 256] + - [245, 3.212] + - - [1856, 4, 1, 3328] + - [242, 2.459] + - - [3584, 4, 1, 256] + - [245, 2.806] + - - [6784, 4, 1, 3328] + - [245, 4.575] + - - [2048, 32, 1, 2048] + - [247, 14.153] + - - [1408, 4, 1, 128] + - [242, 0.947] + - - [5056, 4, 1, 256] + - [246, 3.271] + - - [4288, 4, 1, 1280] + - [245, 4.065] + - - [4608, 16, 1, 1536] + - [196, 12.448] + - - [2944, 4, 1, 1280] + - [246, 3.181] + - - [5888, 4, 1, 256] + - [245, 3.515] + - - [2048, 32, 1, 1001] + - [243, 12.913] + - - [1536, 32, 1, 1001] + - [180, 10.954] + - - [1600, 1, 1, 1024] + - [246, 0.496] + - - [32768, 1, 1, 256] + - [244, 1.128] + - - [2048, 2, 1, 2048] + - [242, 1.29] + - - [2560, 4, 1, 2560] + - [246, 2.897] + - - [3456, 1, 1, 256] + - [245, 0.69] + - - [4096, 1, 1, 256] + - [245, 0.776] + - - [6912, 1, 1, 256] + - [246, 0.848] + - - [2048, 8, 1, 2048] + - [246, 4.999] + - - [2560, 2, 1, 2560] + - [242, 1.453] + - - [29000, 27, 1, 2560] + - [212, 12.073] + - - [4, 1856, 1, 3328] + - [194, 2.4] + - - [4, 1408, 1, 128] + - [178, 0.993] + - - [4, 2368, 1, 1280] + - [178, 2.603] + - - [4, 3584, 1, 128] + - [249, 2.143] + - - [4, 5888, 1, 3328] + - [249, 4.282] + - - [4, 1408, 1, 3328] + - [184, 1.904] + - - [4, 6784, 1, 3328] + - [250, 4.11] + - - [4, 4288, 1, 128] + - [249, 2.49] + - - [4, 6784, 1, 1280] + - [250, 4.079] + - - [4, 2944, 1, 3328] + - [178, 3.253] + - - [4, 5056, 1, 256] + - [249, 3.181] + - - [4, 5056, 1, 1280] + - [249, 3.781] + - - [4, 2368, 1, 3328] + - [210, 2.761] + - - [4, 1856, 1, 256] + - [178, 1.737] + - - [4, 2368, 1, 256] + - [194, 2.026] + - - [4, 2944, 1, 256] + - [250, 2.405] + - - [4, 4288, 1, 1280] + - [249, 3.871] + - - [4, 6784, 1, 128] + - [249, 3.014] + - - [4, 3584, 1, 1280] + - [249, 3.429] + - - [4, 5888, 1, 256] + - [248, 3.46] + - - [4, 6784, 1, 256] + - [250, 3.257] + - - [4, 1408, 1, 1280] + - [184, 1.814] + - - [4, 3584, 1, 256] + - [249, 2.725] + - - [4, 2944, 1, 1280] + - [178, 3.091] + - - [4, 1408, 1, 256] + - [178, 1.367] + - - [4, 4288, 1, 3328] + - [248, 4.124] + - - [4, 5888, 1, 1280] + - [248, 4.079] + - - [4, 1856, 1, 1280] + - [178, 2.292] + - - [4, 1856, 1, 128] + - [178, 1.286] + - - [4, 2944, 1, 128] + - [249, 1.877] + - - [4, 5056, 1, 3328] + - [250, 3.921] + - - [4, 5056, 1, 128] + - [250, 2.703] + - - [4, 4288, 1, 256] + - [250, 3.036] + - - [4, 3584, 1, 3328] + - [250, 3.668] + - - [4, 5888, 1, 128] + - [249, 2.906] + - - [4, 2368, 1, 128] + - [178, 1.588] + - - [32, 1600, 1, 512] + - [178, 12.118] + - - [2, 2048, 1, 1024] + - [178, 1.236] + - - [1, 4096, 1, 256] + - [249, 0.749] + - - [1, 6912, 1, 256] + - [210, 0.835] + - - [2, 2048, 1, 768] + - [178, 1.2] + - - [2, 4608, 1, 768] + - [250, 2.012] + - - [2, 4608, 1, 1024] + - [248, 2.003] + - - [1024, 16, 1, 500000] + - [238, 10.72] + - - [1024, 8, 1, 500000] + - [236, 5.364] + - - [512, 16, 1, 500000] + - [225, 8.505] + - - [512, 8, 1, 500000] + - [225, 4.241] + - - [64, 80, 1, 5329] + - [236, 5.017] + - - [576, 96, 1, 5329] + - [219, 17.853] + - - [288, 32, 1, 21609] + - [221, 10.44] + - - [576, 96, 1, 5041] + - [219, 18.602] + - - [27, 32, 1, 22201] + - [222, 1.335] + - - [160, 64, 1, 5329] + - [218, 7.634] + - - [448, 64, 1, 5329] + - [233, 16.026] + - - [147, 64, 1, 12544] + - [236, 8.279] + - - [147, 64, 1, 22500] + - [218, 9.213] + - - [576, 64, 1, 5625] + - [219, 20.339] + - - [256, 128, 1, 10752] + - [229, 19.333] + - - [256, 128, 1, 10560] + - [219, 20.668] + - - [256, 128, 1, 11408] + - [219, 20.957] + - - [256, 12, 1, 11408] + - [227, 3.912] + - - [256, 128, 1, 11616] + - [219, 20.943] + - - [256, 12, 1, 11616] + - [227, 3.939] + - - [256, 12, 1, 12288] + - [238, 3.939] + - - [11, 11, 1, 1909283] + - [222, 0.208] + - - [11, 11, 1, 3818566] + - [222, 0.208] + - - [768, 32, 1, 768] + - [178, 7.548] + - - [768, 64, 1, 768] + - [178, 12.904] + - - [1024, 80, 1, 1024] + - [179, 15.101] + - - [1024, 20, 1, 1024] + - [178, 6.334] + - - [768, 16, 1, 768] + - [210, 4.079] + - - [1024, 4, 1, 1024] + - [194, 1.381] + - - [1024, 6, 1, 1024] + - [194, 2.075] + - - [4, 704, 1, 1280] + - [194, 0.993] + - - [128, 64, 1, 256] + - [212, 2.139] + - - [128, 448, 1, 1280] + - [197, 13.089] + - - [64, 4, 1, 256] + - [169, 0.059] + - - [64, 704, 1, 128] + - [169, 7.886] + - - [448, 64, 1, 1280] + - [178, 9.375] + - - [128, 4, 1, 1280] + - [184, 0.194] + - - [64, 1024, 1, 1280] + - [204, 14.176] + - - [64, 704, 1, 1280] + - [178, 12.931] + - - [1024, 64, 1, 128] + - [182, 9.461] + - - [1024, 64, 1, 1280] + - [197, 13.883] + - - [4, 704, 1, 256] + - [210, 0.686] + - - [704, 4, 1, 1280] + - [210, 0.988] + - - [448, 128, 1, 128] + - [178, 8.762] + - - [256, 256, 1, 3328] + - [197, 15.191] + - - [4, 64, 1, 1280] + - [169, 0.09] + - - [64, 64, 1, 3328] + - [186, 1.651] + - - [128, 256, 1, 3328] + - [202, 11.388] + - - [64, 448, 1, 1280] + - [178, 9.538] + - - [448, 4, 1, 256] + - [184, 0.438] + - - [128, 4, 1, 128] + - [169, 0.09] + - - [256, 4, 1, 128] + - [169, 0.176] + - - [704, 64, 1, 3328] + - [178, 13.224] + - - [256, 64, 1, 1280] + - [178, 6.01] + - - [704, 64, 1, 128] + - [178, 7.792] + - - [1024, 4, 1, 256] + - [194, 0.993] + - - [256, 256, 1, 128] + - [178, 9.556] + - - [64, 256, 1, 128] + - [178, 3.154] + - - [704, 64, 1, 1280] + - [178, 12.998] + - - [128, 448, 1, 256] + - [178, 10.548] + - - [128, 256, 1, 1280] + - [178, 10.715] + - - [448, 64, 1, 3328] + - [187, 9.993] + - - [256, 128, 1, 128] + - [187, 5.879] + - - [64, 128, 1, 3328] + - [193, 3.298] + - - [128, 128, 1, 3328] + - [169, 6.222] + - - [256, 128, 1, 256] + - [178, 7.95] + - - [64, 448, 1, 3328] + - [178, 10.079] + - - [1024, 4, 1, 3328] + - [210, 1.529] + - - [4, 4, 1, 256] + - [169, 0.005] + - - [256, 64, 1, 256] + - [178, 4.133] + - - [256, 128, 1, 1280] + - [178, 10.9] + - - [128, 64, 1, 1280] + - [184, 3.036] + - - [4, 448, 1, 3328] + - [186, 0.717] + - - [64, 1024, 1, 256] + - [181, 11.469] + - - [256, 4, 1, 1280] + - [184, 0.383] + - - [64, 704, 1, 256] + - [178, 9.818] + - - [4, 704, 1, 128] + - [178, 0.492] + - - [448, 128, 1, 256] + - [210, 10.445] + - - [448, 64, 1, 128] + - [178, 5.342] + - - [4, 1024, 1, 1280] + - [194, 1.439] + - - [4, 448, 1, 1280] + - [184, 0.645] + - - [448, 4, 1, 1280] + - [184, 0.677] + - - [256, 256, 1, 256] + - [198, 11.541] + - - [256, 64, 1, 128] + - [212, 3.091] + - - [4, 1024, 1, 3328] + - [194, 1.534] + - - [64, 128, 1, 128] + - [212, 1.588] + - - [704, 4, 1, 128] + - [178, 0.492] + - - [256, 4, 1, 256] + - [178, 0.248] + - - [256, 4, 1, 3328] + - [184, 0.42] + - - [4, 256, 1, 256] + - [178, 0.248] + - - [4, 4, 1, 128] + - [169, 0.005] + - - [4, 128, 1, 256] + - [178, 0.131] + - - [64, 64, 1, 1280] + - [200, 1.507] + - - [448, 128, 1, 3328] + - [213, 13.364] + - - [64, 448, 1, 256] + - [178, 7.169] + - - [4, 448, 1, 128] + - [169, 0.316] + - - [64, 256, 1, 1280] + - [178, 5.825] + - - [64, 128, 1, 1280] + - [184, 3.005] + - - [64, 4, 1, 128] + - [169, 0.045] + - - [64, 64, 1, 256] + - [178, 1.114] + - - [4, 704, 1, 3328] + - [194, 1.06] + - - [4, 4, 1, 1280] + - [169, 0.005] + - - [128, 128, 1, 128] + - [171, 3.194] + - - [1024, 4, 1, 128] + - [178, 0.713] + - - [4, 64, 1, 128] + - [169, 0.045] + - - [64, 1024, 1, 128] + - [190, 9.556] + - - [128, 128, 1, 1280] + - [178, 5.883] + - - [128, 256, 1, 256] + - [178, 8.193] + - - [64, 128, 1, 256] + - [178, 2.202] + - - [1024, 4, 1, 1280] + - [178, 1.426] + - - [704, 64, 1, 256] + - [178, 9.93] + - - [128, 64, 1, 3328] + - [186, 3.294] + - - [448, 64, 1, 256] + - [178, 7.106] + - - [4, 256, 1, 128] + - [178, 0.18] + - - [1024, 64, 1, 256] + - [182, 11.401] + - - [4, 4, 1, 3328] + - [169, 0.005] + - - [704, 4, 1, 256] + - [169, 0.681] + - - [128, 4, 1, 3328] + - [184, 0.212] + - - [64, 1024, 1, 3328] + - [213, 15.119] + - - [448, 4, 1, 3328] + - [184, 0.735] + - - [4, 128, 1, 3328] + - [177, 0.203] + - - [704, 4, 1, 3328] + - [202, 1.056] + - - [448, 128, 1, 1280] + - [197, 13.089] + - - [1024, 64, 1, 3328] + - [189, 14.713] + - - [4, 1024, 1, 128] + - [178, 0.713] + - - [64, 256, 1, 3328] + - [169, 6.204] + - - [128, 256, 1, 128] + - [178, 5.987] + - - [128, 4, 1, 256] + - [169, 0.122] + - - [256, 256, 1, 1280] + - [197, 15.02] + - - [256, 128, 1, 3328] + - [187, 11.351] + - - [448, 4, 1, 128] + - [178, 0.316] + - - [4, 256, 1, 3328] + - [186, 0.411] + - - [4, 128, 1, 128] + - [169, 0.09] + - - [4, 256, 1, 1280] + - [184, 0.37] + - - [64, 4, 1, 3328] + - [175, 0.104] + - - [4, 64, 1, 3328] + - [177, 0.104] + - - [4, 1024, 1, 256] + - [178, 0.997] + - - [64, 256, 1, 256] + - [178, 4.358] + - - [4, 64, 1, 256] + - [178, 0.063] + - - [128, 448, 1, 128] + - [178, 8.807] + - - [64, 448, 1, 128] + - [178, 5.342] + - - [64, 704, 1, 3328] + - [178, 13.774] + - - [128, 448, 1, 3328] + - [213, 13.54] + - - [4, 448, 1, 256] + - [210, 0.438] + - - [4, 128, 1, 1280] + - [184, 0.185] + - - [128, 64, 1, 128] + - [178, 1.579] + - - [64, 64, 1, 128] + - [212, 0.803] + - - [64, 4, 1, 1280] + - [216, 0.099] + - - [256, 64, 1, 3328] + - [169, 6.217] + - - [128, 128, 1, 256] + - [196, 4.264] + - - [256, 64, 1, 3136] + - [187, 6.199] + - - [64, 200, 1, 1024] + - [178, 4.458] + - - [32, 512, 1, 1024] + - [178, 5.707] + - - [1, 512, 1, 1024] + - [184, 0.18] + - - [128, 512, 1, 2048] + - [197, 15.123] + - - [64, 256, 1, 1024] + - [178, 5.725] + - - [1, 200, 1, 1024] + - [184, 0.072] + - - [128, 512, 1, 1024] + - [213, 14.726] + - - [32, 256, 1, 2048] + - [186, 3.172] + - - [32, 256, 1, 512] + - [178, 2.594] + - - [256, 200, 1, 1024] + - [178, 13.364] + - - [1, 256, 1, 2048] + - [186, 0.099] + - - [32, 200, 1, 2048] + - [184, 2.454] + - - [128, 200, 1, 1024] + - [178, 8.387] + - - [128, 256, 1, 2048] + - [178, 11.121] + - - [64, 1024, 1, 1024] + - [179, 14.298] + - - [1, 512, 1, 2048] + - [184, 0.194] + - - [128, 256, 1, 512] + - [178, 9.533] + - - [128, 200, 1, 2048] + - [210, 8.906] + - - [64, 200, 1, 512] + - [178, 4.006] + - - [1, 256, 1, 1024] + - [178, 0.09] + - - [1, 1024, 1, 1024] + - [178, 0.352] + - - [256, 256, 1, 2048] + - [197, 15.146] + - - [128, 256, 1, 1024] + - [178, 10.512] + - - [1, 256, 1, 4096] + - [177, 0.104] + - - [32, 512, 1, 512] + - [178, 5.157] + - - [64, 200, 1, 2048] + - [210, 4.733] + - - [1, 200, 1, 2048] + - [177, 0.077] + - - [1, 512, 1, 4096] + - [186, 0.208] + - - [256, 256, 1, 1024] + - [213, 14.902] + - - [64, 256, 1, 2048] + - [194, 6.068] + - - [1, 200, 1, 4096] + - [177, 0.081] + - - [32, 256, 1, 1024] + - [200, 2.928] + - - [32, 200, 1, 1024] + - [200, 2.301] + - - [32, 512, 1, 2048] + - [194, 6.037] + - - [128, 200, 1, 512] + - [210, 7.467] + - - [64, 1024, 1, 2048] + - [179, 14.771] + - - [1, 1024, 1, 2048] + - [178, 0.374] + - - [32, 1024, 1, 512] + - [178, 9.163] + - - [64, 1024, 1, 512] + - [179, 13.098] + - - [1, 1024, 1, 4096] + - [178, 0.388] + - - [64, 256, 1, 512] + - [178, 5.184] + - - [256, 200, 1, 512] + - [178, 12.398] + - - [32, 1024, 1, 1024] + - [178, 10.052] + - - [32, 200, 1, 512] + - [178, 1.994] + - - [256, 256, 1, 512] + - [197, 13.661] + - - [128, 512, 1, 512] + - [213, 13.661] + - - [256, 200, 1, 2048] + - [178, 14.045] + - - [64, 512, 1, 2048] + - [178, 11.036] + - - [32, 1024, 1, 2048] + - [210, 10.661] + - - [256, 64, 1, 1225] + - [207, 5.549] + - - [384, 64, 1, 1225] + - [175, 8.311] + - - [288, 64, 1, 1225] + - [207, 6.235] + - - [384, 96, 1, 1225] + - [175, 10.557] + - - [11, 11, 5456, 64] + - [202, 7.422] + - - [14, 14, 4368, 64] + - [202, 10.557] + - - [23, 23, 2720, 64] + - [201, 8.978] + - - [13, 13, 4672, 64] + - [169, 9.619] + - - [29, 29, 2176, 64] + - [201, 13.783] + - - [12, 12, 5040, 64] + - [194, 8.653] + - - [27, 27, 2336, 64] + - [217, 12.227] + - - [10, 10, 5952, 64] + - [169, 6.037] + - - [7, 7, 8192, 64] + - [202, 3.072] + - - [16, 16, 3840, 64] + - [175, 12.637] + - - [17, 17, 3632, 64] + - [179, 7.16] + - - [9, 9, 6544, 64] + - [210, 5.121] + - - [8, 8, 7280, 64] + - [178, 4.11] + - - [21, 21, 2976, 64] + - [195, 8.929] + - - [19, 19, 3264, 64] + - [179, 8.879] + - - [25, 25, 2512, 64] + - [201, 10.485] + - - [18, 18, 3440, 64] + - [179, 7.945] + - - [15, 15, 4096, 64] + - [216, 11.108] + - - [2, 16, 1, 768] + - [169, 0.009] + - - [2, 8, 1, 768] + - [169, 0.005] + - - [2, 64, 1, 768] + - [187, 0.045] + - - [256, 128, 1, 784] + - [169, 9.628] + - - [192, 48, 1, 1225] + - [175, 3.127] + - - [64, 256, 1, 3136] + - [169, 6.24] + - - [512, 144, 1, 196] + - [187, 11.767] + - - [400, 32, 1, 784] + - [175, 4.178] + - - [832, 48, 1, 49] + - [169, 3.803] + - - [192, 32, 1, 784] + - [184, 2.111] + - - [288, 48, 1, 1225] + - [207, 4.688] + - - [512, 112, 1, 196] + - [187, 9.867] + - - [528, 32, 1, 196] + - [200, 3.573] + - - [576, 64, 1, 3136] + - [187, 12.43] + - - [480, 64, 1, 196] + - [194, 6.118] + - - [192, 64, 1, 784] + - [175, 4.002] + - - [192, 32, 1, 1225] + - [177, 2.184] + - - [400, 48, 1, 196] + - [184, 4.043] + - - [480, 16, 1, 196] + - [184, 1.665] + - - [512, 64, 1, 196] + - [200, 6.14] + - - [800, 64, 1, 196] + - [194, 9.24] + - - [512, 128, 1, 784] + - [213, 14.293] + - - [256, 64, 1, 784] + - [192, 5.315] + - - [256, 48, 1, 1225] + - [175, 4.16] + - - [192, 16, 1, 784] + - [184, 1.051] + - - [576, 96, 1, 1225] + - [202, 12.637] + - - [512, 128, 1, 196] + - [187, 10.693] + - - [192, 96, 1, 784] + - [207, 5.96] + - - [192, 64, 1, 1225] + - [175, 4.164] + - - [512, 32, 1, 196] + - [184, 3.451] + - - [528, 128, 1, 196] + - [173, 11.027] + - - [128, 512, 1, 784] + - [213, 14.293] + - - [64, 64, 1, 3136] + - [177, 1.651] + - - [256, 32, 1, 784] + - [184, 2.865] + - - [480, 96, 1, 196] + - [178, 8.419] + - - [1024, 32, 1, 1001] + - [175, 9.204] + - - [18, 18, 648, 64] + - [170, 6.113] + - - [7, 7, 736, 64] + - [178, 2.075] + - - [8, 8, 264, 64] + - [187, 1.719] + - - [9, 9, 416, 64] + - [202, 2.766] + - - [10, 10, 448, 64] + - [202, 3.334] + - - [11, 11, 568, 64] + - [178, 4.53] + - - [12, 12, 480, 64] + - [202, 4.891] + - - [12, 12, 2520, 64] + - [178, 7.738] + - - [13, 13, 576, 64] + - [169, 6.217] + - - [13, 13, 2336, 64] + - [169, 8.608] + - - [14, 14, 704, 64] + - [169, 7.489] + - - [14, 14, 2184, 64] + - [187, 9.308] + - - [15, 15, 688, 64] + - [210, 8.369] + - - [15, 15, 2048, 64] + - [175, 9.885] + - - [16, 16, 712, 64] + - [178, 8.685] + - - [16, 16, 1920, 64] + - [184, 10.9] + - - [17, 17, 688, 64] + - [179, 5.586] + - - [17, 17, 1816, 64] + - [203, 6.659] + - - [18, 18, 1720, 64] + - [203, 7.422] + - - [19, 19, 680, 64] + - [179, 6.853] + - - [19, 19, 1632, 64] + - [195, 7.859] + - - [21, 21, 1472, 64] + - [188, 7.575] + - - [21, 21, 1488, 64] + - [179, 7.747] + - - [23, 23, 64, 64] + - [202, 3.569] + - - [23, 23, 1360, 64] + - [169, 8.265] + - - [25, 25, 176, 64] + - [179, 7.119] + - - [25, 25, 1256, 64] + - [210, 9.614] + - - [26, 26, 56, 64] + - [173, 4.02] + - - [26, 27, 56, 64] + - [173, 4.142] + - - [27, 27, 56, 64] + - [173, 4.336] + - - [27, 27, 1168, 64] + - [217, 11.103] + - - [29, 29, 136, 64] + - [173, 8.053] + - - [29, 29, 1088, 64] + - [201, 12.849] + - - [256, 1, 1, 4] + - [169, 0.005] + - - [2, 1, 1, 1024] + - [192, 0.001] + - - [1024, 1, 1, 1024] + - [178, 0.347] + - - [2, 6, 1, 1024] + - [169, 0.005] + - - [2, 8, 1, 1024] + - [169, 0.005] + - - [14, 14, 1, 64] + - [169, 0.032] + - - [15, 14, 1, 64] + - [169, 0.036] + - - [15, 15, 1, 64] + - [187, 0.041] + - - [17, 15, 1, 64] + - [171, 0.041] + - - [17, 17, 1, 64] + - [211, 0.041] + - - [30, 30, 1, 64] + - [180, 0.126] + - - [30, 31, 1, 64] + - [171, 0.126] + - - [31, 31, 1, 64] + - [180, 0.131] + - - [1024, 32, 1, 1024] + - [178, 10.255] + - - [2, 32, 1, 1024] + - [169, 0.023] + - - [2, 4, 1, 1024] + - [169, 0.005] + - - [64, 512, 1, 512] + - [178, 9.484] + - - [64, 960, 1, 1024] + - [178, 13.621] + - - [200, 1, 1, 1024] + - [184, 0.072] + - - [512, 1, 1, 2048] + - [184, 0.203] + - - [64, 512, 1, 1024] + - [178, 10.499] + - - [3, 3, 512, 64] + - [169, 0.343] + - - [5, 5, 512, 64] + - [202, 0.934] + - - [9, 9, 512, 64] + - [178, 2.878] + - - [128, 256, 1, 1444] + - [175, 9.538] + - - [256, 128, 1, 25] + - [173, 1.985] + - - [256, 128, 1, 9] + - [173, 0.83] + - - [256, 256, 1, 1444] + - [197, 14.424] + - - [512, 128, 1, 100] + - [202, 8.595] + - - [64, 128, 1, 1444] + - [177, 3.0] + - - [1024, 77, 1, 1024] + - [179, 14.663] + - - [2, 10, 1, 1024] + - [175, 0.009] + - - [1024, 10, 1, 1024] + - [178, 3.442] + - - [2, 39, 1, 1024] + - [169, 0.027] + - - [1024, 39, 1, 1024] + - [178, 10.688] + - - [2, 40, 1, 1024] + - [169, 0.027] + - - [1024, 40, 1, 1024] + - [178, 10.963] + - - [2, 41, 1, 1024] + - [169, 0.027] + - - [1024, 41, 1, 1024] + - [178, 11.239] + - - [2, 5, 1, 1024] + - [169, 0.005] + - - [1024, 5, 1, 1024] + - [210, 1.728] + - - [1024, 8, 1, 1024] + - [178, 2.761] + - - [2, 9, 1, 1024] + - [169, 0.005] + - - [1024, 9, 1, 1024] + - [178, 3.109] + - - [4, 4, 32768, 64] + - [187, 1.056] + - - [4, 4, 38400, 64] + - [194, 1.056] + - - [14, 14, 10880, 64] + - [202, 11.568] + - - [15, 14, 10880, 64] + - [202, 12.29] + - - [15, 15, 7680, 64] + - [175, 11.816] + - - [15, 15, 10880, 64] + - [169, 12.209] + - - [17, 15, 7680, 64] + - [187, 8.297] + - - [17, 17, 6144, 64] + - [170, 7.377] + - - [17, 17, 7680, 64] + - [188, 7.634] + - - [21, 17, 6144, 64] + - [188, 9.105] + - - [21, 21, 6144, 64] + - [211, 9.885] + - - [24, 24, 4736, 64] + - [208, 10.093] + - - [30, 30, 2048, 64] + - [176, 14.916] + - - [30, 31, 2048, 64] + - [201, 15.286] + - - [31, 31, 2048, 64] + - [185, 15.714] + - - [34, 24, 4736, 64] + - [172, 11.717] + - - [128, 128, 1, 64] + - [171, 2.093] + - - [2, 1024, 1, 1024] + - [178, 0.704] + - - [5, 5, 1, 64] + - [169, 0.005] + - - [33, 33, 1, 32] + - [169, 0.077] + - - [5, 5, 960, 64] + - [169, 1.187] + - - [27, 27, 32768, 128] + - [201, 10.612] + - - [960, 1, 1, 2048] + - [194, 0.338] + - - [2, 2, 1, 2048] + - [175, 0.002] + - - [1024, 16, 1, 1024] + - [178, 5.572] + - - [2, 16, 1, 1024] + - [169, 0.009] + - - [2, 4, 1, 2560] + - [169, 0.005] + - - [1024, 64, 1, 1024] + - [179, 13.788] + - - [2, 64, 1, 1024] + - [169, 0.045] + - - [864, 1, 1, 256] + - [178, 0.208] + - - [2, 80, 1, 1024] + - [171, 0.054] + - - [1024, 82, 1, 1024] + - [179, 15.371] + - - [2, 82, 1, 1024] + - [184, 0.059] + - - [1024, 12, 1, 1024] + - [178, 4.137] + - - [2, 12, 1, 1024] + - [169, 0.009] + - - [24, 24, 6816, 64] + - [217, 10.228] + - - [26, 26, 6272, 64] + - [185, 11.992] + - - [256, 128, 1, 3136] + - [169, 11.857] + - - [2, 128, 1, 1024] + - [169, 0.086] + - - [2, 96, 1, 1024] + - [184, 0.068] + - - [768, 12, 1, 768] + - [178, 3.014] + - - [768, 4, 1, 768] + - [194, 1.006] + - - [256, 80, 1, 784] + - [207, 6.623] + - - [256, 12, 1, 3800] + - [177, 1.236] + - - [256, 3, 1, 3800] + - [192, 0.32] + - - [256, 12, 1, 950] + - [200, 1.092] + - - [256, 3, 1, 950] + - [192, 0.275] + - - [256, 12, 1, 3220] + - [177, 1.223] + - - [256, 3, 1, 3220] + - [175, 0.316] + - - [256, 12, 1, 3072] + - [200, 1.236] + - - [256, 3, 1, 3072] + - [184, 0.311] + - - [256, 12, 1, 850] + - [200, 1.074] + - - [256, 3, 1, 850] + - [175, 0.271] + - - [256, 12, 1, 2852] + - [177, 1.214] + - - [256, 3, 1, 2852] + - [175, 0.311] + - - [256, 12, 1, 805] + - [184, 1.056] + - - [256, 3, 1, 805] + - [175, 0.266] + - - [256, 3, 1, 864] + - [175, 0.271] + - - [256, 3, 1, 768] + - [184, 0.262] + - - [256, 12, 1, 864] + - [184, 1.078] + - - [256, 12, 1, 768] + - [184, 1.051] + - - [256, 12, 1, 2904] + - [177, 1.214] + - - [256, 3, 1, 2904] + - [175, 0.311] + - - [256, 3, 1, 713] + - [192, 0.262] + - - [256, 12, 1, 888] + - [200, 1.083] + - - [256, 3, 1, 888] + - [175, 0.271] + - - [256, 12, 1, 713] + - [184, 1.024] + - - [256, 3, 1, 660] + - [175, 0.257] + - - [256, 3, 1, 672] + - [175, 0.257] + - - [256, 12, 1, 660] + - [200, 1.015] + - - [256, 3, 1, 726] + - [192, 0.262] + - - [256, 12, 1, 672] + - [184, 1.02] + - - [256, 3, 1, 247] + - [175, 0.18] + - - [256, 12, 1, 726] + - [200, 1.038] + - - [256, 3, 1, 216] + - [175, 0.171] + - - [256, 3, 1, 3400] + - [175, 0.316] + - - [256, 3, 1, 221] + - [175, 0.171] + - - [256, 12, 1, 3552] + - [177, 1.236] + - - [256, 3, 1, 3456] + - [175, 0.316] + - - [256, 3, 1, 204] + - [175, 0.162] + - - [256, 12, 1, 3400] + - [177, 1.232] + - - [256, 12, 1, 3456] + - [184, 1.241] + - - [256, 12, 1, 221] + - [200, 0.686] + - - [256, 3, 1, 3552] + - [175, 0.316] + - - [256, 3, 1, 228] + - [175, 0.176] + - - [256, 3, 1, 234] + - [175, 0.176] + - - [256, 12, 1, 234] + - [216, 0.704] + - - [81, 1024, 1, 1024] + - [211, 15.074] + - - [81, 1000, 1, 1024] + - [211, 14.816] + - - [256, 12, 1, 228] + - [184, 0.704] + - - [256, 3, 1, 252] + - [175, 0.185] + - - [256, 12, 1, 252] + - [200, 0.726] + - - [256, 12, 1, 247] + - [216, 0.726] + - - [1024, 6, 1, 2] + - [169, 0.036] + - - [2, 8, 1, 2048] + - [169, 0.005] + - - [2, 20, 1, 1024] + - [169, 0.014] + - - [2, 2, 1, 2560] + - [175, 0.002] + - - [128, 128, 3072, 64] + - [253, 31.591] + - - [256, 256, 1024, 64] + - [254, 43.561] + - - [256, 256, 1536, 64] + - [254, 39.958] + - - [256, 256, 2048, 64] + - [254, 38.636] + - - [256, 256, 3072, 64] + - [255, 37.164] + - - [384, 384, 12, 64] + - [256, 36.34] + - - [384, 384, 16, 64] + - [256, 45.116] + - - [384, 384, 24, 64] + - [257, 52.35] + - - [384, 384, 32, 64] + - [258, 57.344] + - - [384, 384, 48, 64] + - [259, 63.431] + - - [384, 384, 64, 64] + - [260, 67.82] + - - [384, 384, 96, 64] + - [261, 72.09] + - - [384, 384, 128, 64] + - [262, 74.986] + - - [384, 384, 256, 64] + - [264, 61.981] + - - [384, 384, 384, 64] + - [265, 55.199] + - - [384, 384, 512, 64] + - [266, 49.617] + - - [384, 384, 768, 64] + - [267, 45.794] + - - [384, 384, 1024, 64] + - [267, 43.823] + - - [384, 384, 1536, 64] + - [268, 40.978] + - - [384, 384, 2048, 64] + - [269, 40.902] + - - [384, 384, 3072, 64] + - [270, 40.916] + - - [512, 512, 384, 64] + - [265, 53.212] + - - [512, 512, 512, 64] + - [265, 50.284] + - - [512, 512, 768, 64] + - [271, 46.194] + - - [512, 512, 1024, 64] + - [272, 44.212] + - - [512, 512, 1536, 64] + - [273, 44.117] + - - [512, 512, 2048, 64] + - [272, 44.07] + - - [512, 512, 3072, 64] + - [274, 44.001] + - - [768, 128, 1, 768] + - [275, 13.903] + - - [768, 256, 1, 768] + - [276, 24.278] + - - [16, 16, 12, 64] + - [277, 0.123] + - - [16, 16, 16, 64] + - [278, 0.141] + - - [16, 16, 24, 64] + - [279, 0.211] + - - [16, 16, 32, 64] + - [280, 0.28] + - - [16, 16, 48, 64] + - [281, 0.416] + - - [16, 16, 64, 64] + - [279, 0.55] + - - [16, 16, 96, 64] + - [282, 0.816] + - - [16, 16, 128, 64] + - [283, 1.108] + - - [16, 16, 192, 64] + - [283, 1.573] + - - [16, 16, 256, 64] + - [283, 1.963] + - - [16, 16, 384, 64] + - [283, 2.683] + - - [16, 16, 512, 64] + - [283, 3.218] + - - [16, 16, 768, 64] + - [284, 3.846] + - - [16, 16, 1024, 64] + - [285, 4.243] + - - [16, 16, 1536, 64] + - [286, 5.064] + - - [16, 16, 2048, 64] + - [285, 5.595] + - - [16, 16, 3072, 64] + - [287, 6.372] + - - [32, 32, 12, 64] + - [283, 0.45] + - - [32, 32, 16, 64] + - [281, 0.603] + - - [32, 32, 24, 64] + - [283, 0.917] + - - [32, 32, 32, 64] + - [288, 1.229] + - - [32, 32, 48, 64] + - [283, 1.774] + - - [32, 32, 64, 64] + - [283, 2.33] + - - [32, 32, 96, 64] + - [283, 3.412] + - - [32, 32, 128, 64] + - [283, 4.262] + - - [32, 32, 192, 64] + - [288, 5.643] + - - [32, 32, 256, 64] + - [287, 6.687] + - - [32, 32, 384, 64] + - [289, 8.386] + - - [32, 32, 512, 64] + - [287, 9.533] + - - [32, 32, 768, 64] + - [290, 10.711] + - - [32, 32, 1024, 64] + - [290, 12.308] + - - [32, 32, 1536, 64] + - [291, 14.027] + - - [32, 32, 2048, 64] + - [291, 18.12] + - - [32, 32, 3072, 64] + - [287, 23.741] + - - [64, 64, 12, 64] + - [288, 2.211] + - - [64, 64, 16, 64] + - [279, 2.365] + - - [64, 64, 24, 64] + - [288, 3.487] + - - [64, 64, 32, 64] + - [278, 4.506] + - - [64, 64, 48, 64] + - [283, 6.212] + - - [64, 64, 64, 64] + - [292, 7.771] + - - [64, 64, 128, 64] + - [293, 11.92] + - - [64, 64, 192, 64] + - [294, 14.94] + - - [64, 64, 256, 64] + - [295, 16.654] + - - [64, 64, 384, 64] + - [296, 19.277] + - - [64, 64, 512, 64] + - [296, 25.296] + - - [64, 64, 1024, 64] + - [297, 30.247] + - - [64, 64, 1536, 64] + - [298, 40.514] + - - [64, 64, 2048, 64] + - [298, 42.155] + - - [64, 64, 3072, 64] + - [288, 23.036] + - - [64, 64, 1280, 64] + - [298, 37.621] + - - [128, 128, 12, 64] + - [299, 5.938] + - - [128, 128, 16, 64] + - [299, 7.022] + - - [128, 128, 24, 64] + - [299, 9.891] + - - [128, 128, 32, 64] + - [256, 12.308] + - - [128, 128, 48, 64] + - [299, 16.921] + - - [128, 128, 64, 64] + - [300, 20.025] + - - [128, 128, 768, 64] + - [301, 61.003] + - - [256, 256, 12, 64] + - [302, 22.984] + - - [256, 256, 16, 64] + - [302, 21.504] + - - [256, 256, 24, 64] + - [260, 26.497] + - - [256, 256, 32, 64] + - [260, 29.919] + - - [256, 256, 48, 64] + - [258, 35.817] + - - [256, 256, 64, 64] + - [258, 39.734] + - - [256, 256, 128, 64] + - [303, 62.622] + - - [256, 256, 256, 64] + - [303, 71.876] + - - [256, 256, 384, 64] + - [302, 57.883] + - - [256, 256, 512, 64] + - [302, 36.269] + - - [256, 256, 768, 64] + - [304, 34.792] + - - [512, 512, 12, 64] + - [257, 45.235] + - - [512, 512, 24, 64] + - [257, 61.291] + - - [512, 512, 32, 64] + - [257, 63.978] + - - [768, 1, 1, 768] + - [305, 0.162] + - - [768, 2, 1, 768] + - [306, 0.324] + - - [768, 8, 1, 768] + - [305, 1.287] + - - [1024, 2, 1, 1024] + - [307, 0.476] - null -- DeviceEfficiency diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB_GB.yaml new file mode 100644 index 000000000..9e99c492e --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi31/navi31_Cijk_Alik_Bljk_SB_GB.yaml @@ -0,0 +1,66118 @@ +- {MinimumRequiredVersion: 4.26.0} +- navi31 +- gfx1100 +- [Device 744c] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17279.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19284.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17921.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 1024, 1024] + - [51, 20009.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16681.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18346.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17576.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 1024, 1024] + - [51, 19860.0] + - - [30522, 320, 1, 768, 30522, 30522, 768, 768] + - [30, 15999.0] + - - [3072, 4096, 1, 768, 3072, 3072, 768, 768] + - [15, 19154.0] + - - [768, 4096, 1, 3072, 768, 768, 3072, 3072] + - [32, 17701.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [32, 17107.0] + - - [30522, 160, 1, 768, 30522, 30522, 768, 768] + - [29, 13210.0] + - - [30522, 640, 1, 768, 30522, 30522, 768, 768] + - [49, 19094.0] + - - [30522, 1280, 1, 768, 30522, 30522, 768, 768] + - [43, 19560.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16988.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] + - [32, 17488.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] + - [15, 17668.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 18520.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] + - [32, 19755.0] + - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] + - [13, 15564.0] + - - [30522, 160, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 12911.0] + - - [128, 128, 512, 64, 128, 128, 64, 64] + - [38, 13823.0] + - - [512, 512, 64, 64, 512, 512, 64, 64] + - [38, 17330.0] + - - [256, 256, 192, 64, 256, 256, 64, 64] + - [1, 16536.0] + - - [256, 256, 96, 64, 256, 256, 64, 64] + - [5, 14432.0] + - - [128, 128, 384, 64, 128, 128, 64, 64] + - [21, 13377.0] + - - [128, 128, 96, 64, 128, 128, 64, 64] + - [19, 10663.0] + - - [512, 512, 16, 64, 512, 512, 64, 64] + - [0, 14432.0] + - - [512, 512, 96, 64, 512, 512, 64, 64] + - [38, 17583.0] + - - [512, 512, 128, 64, 512, 512, 64, 64] + - [0, 14851.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [15, 19429.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [30, 17944.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [30, 17573.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [49, 19309.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [13, 17422.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 18303.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [27, 14603.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [13, 19450.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [13, 16113.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 128] + - [49, 15809.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 19188.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [13, 18305.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 128] + - [22, 17714.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [49, 17033.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [51, 19922.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 18276.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [29, 13906.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 16208.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 19854.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [13, 19078.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [10, 12795.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [37, 19494.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 128] + - [27, 13996.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 16794.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [7, 19479.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [27, 12989.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [13, 17872.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 19081.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [49, 16253.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [29, 14694.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [10, 13821.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [30, 17605.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [13, 17093.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [43, 17671.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [47, 15569.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [32, 19877.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [30, 18533.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 128] + - [22, 17511.0] + - - [704, 5056, 1, 128, 704, 704, 128, 128] + - [10, 14264.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 17590.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [41, 19751.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 17797.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [30, 18979.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [48, 14404.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 19606.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [49, 18746.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 15951.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 128] + - [46, 15451.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [13, 18269.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 16629.0] + - - [448, 5888, 1, 128, 448, 448, 128, 128] + - [25, 13046.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 18157.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [15, 13161.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [13, 19175.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [12, 13229.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [30, 16900.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 128] + - [26, 17233.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 16468.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [29, 14821.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [30, 17486.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [49, 18623.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [15, 15560.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 128] + - [41, 18830.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [32, 17309.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 19450.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 19735.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 19579.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [13, 15952.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [30, 16276.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [10, 13451.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 128] + - [46, 16088.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [24, 13495.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 17612.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 17484.0] + - - [704, 2944, 1, 128, 704, 704, 128, 128] + - [10, 13535.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [29, 14835.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [49, 15924.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [30, 15903.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 128] + - [9, 16793.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 19075.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [49, 19105.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 18127.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 18621.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 19136.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 19507.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [30, 19018.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 18764.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [13, 15404.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [24, 20044.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [30, 18051.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [15, 20300.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 19434.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [49, 18381.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 17901.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 16732.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 128] + - [41, 18511.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [32, 17210.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 128] + - [5, 16780.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 128] + - [46, 16692.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 19524.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [13, 18280.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [13, 16324.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 128] + - [41, 16343.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [48, 14298.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [32, 17577.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [49, 18616.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [49, 19341.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 18099.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [30, 15948.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [13, 15955.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [30, 16050.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 18700.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 17881.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [30, 16892.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 128] + - [12, 13335.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [32, 20160.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [49, 17334.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [32, 20075.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 18407.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [15, 14857.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [32, 16431.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [7, 19593.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 128] + - [46, 18203.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 128] + - [26, 16503.0] + - - [448, 3584, 1, 128, 448, 448, 128, 128] + - [8, 11624.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 19364.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 128] + - [22, 17439.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [13, 19225.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 128] + - [27, 13030.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 128] + - [12, 12138.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 19408.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 128] + - [41, 16790.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 128] + - [45, 14309.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [13, 18137.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [15, 19225.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 19729.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [32, 19671.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [47, 14391.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 128] + - [9, 16533.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 19122.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [24, 19594.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [13, 18042.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 17514.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [13, 17946.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [14, 16803.0] + - - [256, 5056, 1, 128, 256, 256, 128, 128] + - [8, 12327.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [13, 17217.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 18348.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [47, 14282.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [7, 20329.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 128] + - [10, 15974.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [51, 18842.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [15, 18430.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 14825.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 13527.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [30, 17931.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 128] + - [9, 17656.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [43, 15350.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [7, 19335.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 15568.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [32, 12817.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19620.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 128] + - [47, 16441.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 128] + - [46, 17958.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 20111.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [13, 17798.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [30, 16818.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 19778.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [27, 12919.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [32, 20012.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 13962.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [13, 19712.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [10, 13072.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 18715.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 128] + - [47, 13702.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [30, 18408.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 128] + - [49, 18598.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 128] + - [47, 13264.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 18364.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [49, 18107.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [13, 17975.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 128] + - [22, 18128.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 128] + - [12, 14222.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 18427.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [30, 19256.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 128] + - [27, 16727.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [30, 16540.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [49, 18107.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 128] + - [13, 18611.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [13, 18786.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [7, 20434.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [48, 12797.0] + - - [448, 4288, 1, 128, 448, 448, 128, 128] + - [8, 12356.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [10, 15611.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 128] + - [27, 16020.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [32, 17201.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [48, 13197.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [49, 18940.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [30, 18504.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 17671.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [30, 17052.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [24, 19253.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 128] + - [46, 17138.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 19157.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [30, 19052.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 128] + - [27, 16095.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [30, 14641.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 128] + - [41, 18800.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [30, 17566.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 19563.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [18, 19102.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [12, 13799.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 17631.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [30, 18624.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 19925.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [32, 20413.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 20361.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 128] + - [49, 18805.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 18205.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [30, 18091.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 18236.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 18324.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [30, 19467.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [13, 19225.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [13, 15971.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 128] + - [46, 14939.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [49, 17707.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [15, 16591.0] + - - [704, 6784, 1, 128, 704, 704, 128, 128] + - [27, 15199.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 15617.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [49, 18027.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 128] + - [22, 18861.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 14238.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [15, 13906.0] + - - [256, 5888, 1, 128, 256, 256, 128, 128] + - [27, 12897.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [30, 17703.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 19081.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 15209.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [49, 17330.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 128] + - [37, 18026.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [32, 17567.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 128] + - [12, 13593.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 20199.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 128] + - [26, 18110.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 16601.0] + - - [448, 5056, 1, 128, 448, 448, 128, 128] + - [8, 12443.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [13, 18445.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 128] + - [46, 16622.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 128] + - [22, 18266.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [30, 17779.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [7, 20422.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [51, 20104.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [13, 19161.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 19167.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [48, 13294.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [24, 19984.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 20033.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [12, 13034.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [24, 19624.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [15, 19749.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 128] + - [41, 18368.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 128] + - [8, 14773.0] + - - [704, 3584, 1, 128, 704, 704, 128, 128] + - [47, 13434.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 14414.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 18578.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 128] + - [13, 17446.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [15, 19678.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [13, 19232.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [30, 18506.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [51, 16546.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 128] + - [41, 18240.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [29, 14424.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [30, 18379.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [49, 15693.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [13, 19636.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [30, 17403.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 15827.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 128] + - [46, 18400.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 17092.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [30, 19000.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 19789.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 18044.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [49, 17054.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [49, 18373.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 128] + - [8, 14735.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 128] + - [8, 13530.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 128] + - [27, 16003.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 128] + - [5, 17863.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 19890.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 128] + - [46, 17088.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 19669.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [15, 17119.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 18808.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 17775.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 128] + - [41, 17897.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [30, 17829.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [15, 19454.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [10, 14592.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 128] + - [47, 15354.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 19213.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 17602.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 128] + - [25, 12781.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 17371.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [30, 16874.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [49, 17350.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [13, 15953.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [51, 20212.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [43, 19396.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 17344.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [13, 19090.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [30, 18103.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [32, 19301.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 128] + - [47, 15229.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [30, 18212.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [7, 19769.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [49, 18265.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [30, 18388.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [13, 18707.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [13, 16981.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [49, 16110.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [49, 19209.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [15, 17542.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [51, 16953.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [51, 17263.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 17907.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 20119.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 18278.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [30, 18496.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [49, 15305.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 128] + - [26, 17552.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [49, 17420.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [49, 16028.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18950.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 14980.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [49, 18266.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 19283.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [30, 19387.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [26, 14490.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [49, 18766.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [12, 14740.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [13, 18915.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 128] + - [47, 13617.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [7, 19398.0] + - - [448, 6784, 1, 128, 448, 448, 128, 128] + - [47, 13508.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [30, 19139.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 128] + - [41, 17813.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [51, 18632.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 128] + - [12, 12597.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 20309.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [13, 16522.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [32, 18573.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [12, 14033.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 17751.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [15, 16906.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [43, 15335.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 19082.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [24, 20207.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [15, 19378.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [13, 16839.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [30, 15519.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [29, 14248.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 128] + - [46, 15583.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 128] + - [41, 17607.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [15, 18898.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [49, 17032.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 128] + - [5, 17735.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 19214.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 128] + - [27, 12225.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [15, 19841.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [47, 14112.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 18651.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 18635.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 15064.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [49, 17905.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [51, 14513.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [30, 15457.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [27, 14297.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [32, 17162.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 128] + - [46, 17615.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 16214.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 128] + - [22, 17900.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [49, 18116.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [49, 18707.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 128] + - [46, 18382.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 19652.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 128] + - [26, 15843.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 128] + - [27, 15845.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 128] + - [21, 13874.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [30, 14147.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 128] + - [47, 15421.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [30, 14505.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [30, 15370.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 128] + - [27, 15272.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [49, 15401.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 18079.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [30, 18762.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [47, 13616.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 19036.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 128] + - [49, 16547.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 19932.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [32, 19139.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [32, 17240.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 14223.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 17858.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 128] + - [27, 15874.0] + - - [256, 6784, 1, 128, 256, 256, 128, 128] + - [27, 14510.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 128] + - [22, 18461.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 128] + - [22, 17787.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 128] + - [30, 16800.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [15, 18964.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [49, 17149.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 19037.0] + - - [704, 5888, 1, 128, 704, 704, 128, 128] + - [10, 14862.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 128] + - [46, 18660.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 19754.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 128] + - [47, 13134.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [13, 17603.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [13, 16168.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [26, 11818.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 15058.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [51, 17591.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [29, 15056.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [15, 19813.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [13, 19139.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 128] + - [22, 16807.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 16597.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 18619.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 19219.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [27, 14278.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [7, 20145.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 128] + - [27, 16710.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 128] + - [5, 17279.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [51, 16165.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [32, 13458.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [51, 15864.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [30, 15877.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 128] + - [13, 18168.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [27, 11921.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [15, 20126.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [13, 18735.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [37, 19942.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [32, 15100.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 16852.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 128] + - [8, 11363.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 128] + - [22, 17283.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [15, 20272.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [29, 13126.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 13677.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 128] + - [8, 13823.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 19636.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [13, 16451.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [49, 17751.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [13, 17412.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 19259.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [32, 19305.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [32, 20423.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [49, 17655.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [7, 20310.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [30, 15437.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 18073.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 128] + - [41, 18151.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 128] + - [27, 14518.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [15, 19542.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 19815.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 128] + - [46, 18324.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [17, 16952.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [49, 18437.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [30, 16189.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [29, 12570.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [15, 13466.0] + - - [704, 2368, 1, 128, 704, 704, 128, 128] + - [25, 12363.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 128] + - [9, 12384.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 128] + - [41, 16274.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 128] + - [46, 17838.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [43, 15855.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [49, 16962.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 19950.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 128] + - [26, 18546.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [13, 17817.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [15, 20027.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [12, 13398.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [13, 19345.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 128] + - [49, 18217.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 19923.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [7, 19727.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [49, 16092.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [32, 20251.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 128] + - [8, 13844.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [41, 15652.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 19692.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19810.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [30, 17812.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 128] + - [41, 18805.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [13, 18798.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [14, 13390.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [30, 16613.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 18298.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [13, 18343.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 16232.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 18215.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [29, 15049.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [51, 20087.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [15, 18942.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [13, 15085.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [32, 16613.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [49, 14967.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [30, 17028.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 17746.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [12, 12040.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [51, 19667.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [49, 18772.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [32, 16710.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 19429.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [15, 18650.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 18503.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [13, 16933.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 128] + - [49, 17255.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 18648.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [13, 18348.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [7, 20196.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [32, 19820.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [30, 16436.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [30, 16609.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 128] + - [9, 16584.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [18, 14696.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 128] + - [27, 14663.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 128] + - [47, 14473.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 128] + - [27, 15275.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [32, 18049.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [32, 18722.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 17866.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [15, 20366.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [30, 18750.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [32, 15815.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [13, 18727.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [13, 19497.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [13, 18714.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [51, 15430.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 128] + - [8, 15612.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [30, 18626.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [13, 16016.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [7, 19889.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [15, 14435.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 128] + - [25, 13939.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 15883.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 128] + - [49, 18065.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [50, 16612.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [30, 18706.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 17491.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [49, 14428.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 128] + - [49, 17601.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 19866.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [27, 11466.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 128] + - [41, 16453.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [49, 18257.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 16053.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [24, 20282.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [30, 17509.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [30, 15447.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [30, 16887.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 19752.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [32, 16844.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [13, 17748.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [31, 16599.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [24, 17032.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 128] + - [9, 17255.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [49, 19476.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [49, 14851.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [51, 17233.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 128] + - [47, 15177.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 128] + - [40, 12763.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [24, 20085.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [37, 19414.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [32, 18583.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [30, 17465.0] + - - [704, 4288, 1, 128, 704, 704, 128, 128] + - [27, 13741.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [30, 18496.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [47, 16028.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [29, 13585.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 128] + - [26, 16999.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 18743.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 19097.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [49, 14919.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [13, 19896.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [15, 16257.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [32, 20016.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 128] + - [27, 15441.0] + - - [4096, 512, 1, 32, 4096, 4096, 32, 32] + - [0, 9670.0] + - - [2048, 1024, 1, 1664, 2048, 2048, 1664, 1664] + - [24, 17403.0] + - - [4096, 512, 1, 1408, 4096, 4096, 1408, 1408] + - [24, 17225.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 1280, 1280] + - [30, 17018.0] + - - [2048, 1024, 1, 640, 2048, 2048, 640, 640] + - [43, 16761.0] + - - [4096, 1024, 1, 13312, 4096, 4096, 13312, 13312] + - [49, 16466.0] + - - [2048, 1024, 1, 13312, 2048, 2048, 13312, 13312] + - [13, 15939.0] + - - [2048, 1024, 1, 3584, 2048, 2048, 3584, 3584] + - [15, 17407.0] + - - [4096, 1024, 1, 1920, 4096, 4096, 1920, 1920] + - [46, 17318.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 12288, 12288] + - [32, 16165.0] + - - [4096, 1024, 1, 8320, 4096, 4096, 8320, 8320] + - [24, 17227.0] + - - [4096, 1024, 1, 15360, 4096, 4096, 15360, 15360] + - [49, 16042.0] + - - [4096, 512, 1, 3072, 4096, 4096, 3072, 3072] + - [15, 17341.0] + - - [4096, 512, 1, 13312, 4096, 4096, 13312, 13312] + - [13, 14645.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 3840, 3840] + - [49, 17132.0] + - - [2048, 1024, 1, 3200, 2048, 2048, 3200, 3200] + - [24, 17601.0] + - - [4096, 512, 1, 3840, 4096, 4096, 3840, 3840] + - [43, 17573.0] + - - [4096, 512, 1, 5632, 4096, 4096, 5632, 5632] + - [32, 17621.0] + - - [4096, 512, 1, 64, 4096, 4096, 64, 64] + - [0, 12662.0] + - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] + - [32, 16146.0] + - - [4096, 512, 1, 8192, 4096, 4096, 8192, 8192] + - [15, 16401.0] + - - [4096, 512, 1, 2304, 4096, 4096, 2304, 2304] + - [15, 17227.0] + - - [4096, 512, 1, 2816, 4096, 4096, 2816, 2816] + - [15, 17456.0] + - - [2048, 1024, 1, 7680, 2048, 2048, 7680, 7680] + - [15, 17642.0] + - - [4096, 512, 1, 1920, 4096, 4096, 1920, 1920] + - [24, 17384.0] + - - [4096, 1024, 1, 32, 4096, 4096, 32, 32] + - [8, 12005.0] + - - [4096, 512, 1, 16640, 4096, 4096, 16640, 16640] + - [51, 17690.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] + - [51, 16702.0] + - - [4096, 512, 1, 1792, 4096, 4096, 1792, 1792] + - [51, 17239.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 8192, 8192] + - [32, 15605.0] + - - [2048, 1024, 1, 4160, 2048, 2048, 4160, 4160] + - [43, 17593.0] + - - [4096, 512, 1, 10240, 4096, 4096, 10240, 10240] + - [51, 15771.0] + - - [4096, 512, 1, 512, 4096, 4096, 512, 512] + - [28, 15698.0] + - - [2048, 1024, 1, 6656, 2048, 2048, 6656, 6656] + - [15, 17600.0] + - - [2048, 1024, 1, 14336, 2048, 2048, 14336, 14336] + - [13, 15901.0] + - - [4096, 512, 1, 11264, 4096, 4096, 11264, 11264] + - [13, 15197.0] + - - [4096, 512, 1, 128, 4096, 4096, 128, 128] + - [27, 13489.0] + - - [4096, 512, 1, 768, 4096, 4096, 768, 768] + - [15, 16697.0] + - - [4096, 1024, 1, 11264, 4096, 4096, 11264, 11264] + - [49, 16050.0] + - - [4096, 1024, 1, 16640, 4096, 4096, 16640, 16640] + - [49, 17218.0] + - - [2048, 1024, 1, 5632, 2048, 2048, 5632, 5632] + - [15, 17537.0] + - - [4096, 512, 1, 12288, 4096, 4096, 12288, 12288] + - [14, 14291.0] + - - [4096, 1024, 1, 5632, 4096, 4096, 5632, 5632] + - [30, 17170.0] + - - [2048, 1024, 1, 10240, 2048, 2048, 10240, 10240] + - [15, 17600.0] + - - [4096, 1024, 1, 640, 4096, 4096, 640, 640] + - [46, 16891.0] + - - [2048, 1024, 1, 12288, 2048, 2048, 12288, 12288] + - [15, 16936.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 10240, 10240] + - [51, 16264.0] + - - [2048, 1024, 1, 4608, 2048, 2048, 4608, 4608] + - [15, 17481.0] + - - [4096, 512, 1, 3584, 4096, 4096, 3584, 3584] + - [15, 17475.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4608, 4608] + - [30, 17165.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 3328, 3328] + - [30, 17166.0] + - - [2048, 1024, 1, 9216, 2048, 2048, 9216, 9216] + - [15, 17661.0] + - - [2048, 1024, 1, 2304, 2048, 2048, 2304, 2304] + - [32, 17408.0] + - - [4096, 512, 1, 6144, 4096, 4096, 6144, 6144] + - [15, 17574.0] + - - [4096, 512, 1, 15360, 4096, 4096, 15360, 15360] + - [30, 14780.0] + - - [4096, 1024, 1, 7168, 4096, 4096, 7168, 7168] + - [15, 17186.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 9216, 9216] + - [32, 16138.0] + - - [4096, 1024, 1, 7680, 4096, 4096, 7680, 7680] + - [32, 17153.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 8192, 8192] + - [15, 17446.0] + - - [4096, 1024, 1, 64, 4096, 4096, 64, 64] + - [0, 14653.0] + - - [2048, 1024, 1, 1280, 2048, 2048, 1280, 1280] + - [24, 17199.0] + - - [2048, 1024, 1, 3328, 2048, 2048, 3328, 3328] + - [51, 17544.0] + - - [4096, 512, 1, 14336, 4096, 4096, 14336, 14336] + - [30, 15083.0] + - - [4096, 512, 1, 8320, 4096, 4096, 8320, 8320] + - [24, 17658.0] + - - [4096, 1024, 1, 6656, 4096, 4096, 6656, 6656] + - [30, 17186.0] + - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] + - [11, 14378.0] + - - [4096, 512, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 16912.0] + - - [4096, 1024, 1, 1536, 4096, 4096, 1536, 1536] + - [30, 17025.0] + - - [2048, 1024, 1, 32, 2048, 2048, 32, 32] + - [0, 9020.0] + - - [4096, 512, 1, 640, 4096, 4096, 640, 640] + - [15, 16599.0] + - - [4096, 512, 1, 16384, 4096, 4096, 16384, 16384] + - [51, 15218.0] + - - [4096, 1024, 1, 512, 4096, 4096, 512, 512] + - [30, 16443.0] + - - [2048, 1024, 1, 1152, 2048, 2048, 1152, 1152] + - [43, 17274.0] + - - [4096, 1024, 1, 2080, 4096, 4096, 2080, 2080] + - [37, 17786.0] + - - [4096, 1024, 1, 768, 4096, 4096, 768, 768] + - [30, 16881.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 2560, 2560] + - [30, 17073.0] + - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] + - [8, 12980.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 16384, 16384] + - [51, 15342.0] + - - [4096, 512, 1, 6656, 4096, 4096, 6656, 6656] + - [32, 17597.0] + - - [2048, 1024, 1, 128, 2048, 2048, 128, 128] + - [10, 13422.0] + - - [2048, 1024, 1, 2080, 2048, 2048, 2080, 2080] + - [43, 17485.0] + - - [2048, 1024, 1, 16640, 2048, 2048, 16640, 16640] + - [15, 17632.0] + - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] + - [15, 17333.0] + - - [4096, 1024, 1, 1408, 4096, 4096, 1408, 1408] + - [26, 17283.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 2048, 2048] + - [13, 17026.0] + - - [2048, 1024, 1, 2560, 2048, 2048, 2560, 2560] + - [51, 17414.0] + - - [4096, 1024, 1, 128, 4096, 4096, 128, 128] + - [9, 15322.0] + - - [4096, 1024, 1, 14336, 4096, 4096, 14336, 14336] + - [51, 16465.0] + - - [4096, 512, 1, 9216, 4096, 4096, 9216, 9216] + - [13, 16001.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [51, 17179.0] + - - [4096, 512, 1, 1536, 4096, 4096, 1536, 1536] + - [51, 17156.0] + - - [2048, 1024, 1, 16384, 2048, 2048, 16384, 16384] + - [32, 15760.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 16687.0] + - - [4096, 1024, 1, 1664, 4096, 4096, 1664, 1664] + - [26, 17389.0] + - - [4096, 512, 1, 384, 4096, 4096, 384, 384] + - [7, 15890.0] + - - [4096, 512, 1, 3328, 4096, 4096, 3328, 3328] + - [15, 17488.0] + - - [4096, 1024, 1, 256, 4096, 4096, 256, 256] + - [13, 16225.0] + - - [2048, 1024, 1, 7168, 2048, 2048, 7168, 7168] + - [15, 17647.0] + - - [2048, 1024, 1, 1536, 2048, 2048, 1536, 1536] + - [32, 17206.0] + - - [4096, 512, 1, 7168, 4096, 4096, 7168, 7168] + - [15, 17576.0] + - - [4096, 1024, 1, 896, 4096, 4096, 896, 896] + - [26, 17037.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 17091.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 6144, 6144] + - [15, 17624.0] + - - [4096, 512, 1, 4160, 4096, 4096, 4160, 4160] + - [7, 17625.0] + - - [4096, 512, 1, 2080, 4096, 4096, 2080, 2080] + - [24, 17439.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 5120, 5120] + - [15, 17146.0] + - - [2048, 1024, 1, 1920, 2048, 2048, 1920, 1920] + - [43, 17411.0] + - - [2048, 1024, 1, 15360, 2048, 2048, 15360, 15360] + - [13, 15384.0] + - - [4096, 1024, 1, 2816, 4096, 4096, 2816, 2816] + - [13, 17080.0] + - - [4096, 512, 1, 256, 4096, 4096, 256, 256] + - [30, 14971.0] + - - [2048, 1024, 1, 5120, 2048, 2048, 5120, 5120] + - [15, 17579.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 4096, 4096] + - [15, 17542.0] + - - [4096, 512, 1, 4608, 4096, 4096, 4608, 4608] + - [51, 17568.0] + - - [4096, 512, 1, 1664, 4096, 4096, 1664, 1664] + - [24, 17320.0] + - - [2048, 1024, 1, 896, 2048, 2048, 896, 896] + - [24, 17073.0] + - - [4096, 1024, 1, 4160, 4096, 4096, 4160, 4160] + - [37, 17823.0] + - - [2048, 1024, 1, 11264, 2048, 2048, 11264, 11264] + - [51, 17600.0] + - - [2048, 1024, 1, 384, 2048, 2048, 384, 384] + - [9, 15475.0] + - - [2048, 1024, 1, 3840, 2048, 2048, 3840, 3840] + - [43, 17609.0] + - - [4096, 512, 1, 1280, 4096, 4096, 1280, 1280] + - [43, 17093.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 1152, 1152] + - [26, 17270.0] + - - [2048, 1024, 1, 1408, 2048, 2048, 1408, 1408] + - [24, 17337.0] + - - [4096, 512, 1, 896, 4096, 4096, 896, 896] + - [43, 16987.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 3072, 3072] + - [13, 17131.0] + - - [2048, 1024, 1, 2816, 2048, 2048, 2816, 2816] + - [51, 17488.0] + - - [4096, 1024, 1, 1792, 4096, 4096, 1792, 1792] + - [13, 17069.0] + - - [4096, 512, 1, 1152, 4096, 4096, 1152, 1152] + - [24, 17117.0] + - - [4096, 512, 1, 7680, 4096, 4096, 7680, 7680] + - [51, 17615.0] + - - [4096, 1024, 1, 384, 4096, 4096, 384, 384] + - [26, 16580.0] + - - [2048, 1024, 1, 1792, 2048, 2048, 1792, 1792] + - [51, 17320.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 3584, 3584] + - [13, 17162.0] + - - [2048, 1024, 1, 768, 2048, 2048, 768, 768] + - [32, 16865.0] + - - [2048, 1024, 1, 8320, 2048, 2048, 8320, 8320] + - [24, 17714.0] + - - [4096, 512, 1, 2048, 4096, 4096, 2048, 2048] + - [15, 17296.0] + - - [4096, 512, 1, 2560, 4096, 4096, 2560, 2560] + - [51, 17400.0] + - - [4096, 1024, 1, 2304, 4096, 4096, 2304, 2304] + - [43, 17132.0] + - - [4096, 512, 1, 5120, 4096, 4096, 5120, 5120] + - [15, 17575.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 6144, 6144] + - [13, 17123.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16875.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18907.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16647.0] + - - [132, 134, 480, 64, 132, 132, 64, 64] + - [36, 6269.0] + - - [162, 162, 400, 64, 162, 162, 64, 64] + - [0, 8762.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19133.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18504.0] + - - [132, 135, 480, 64, 132, 132, 64, 64] + - [36, 6310.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17166.0] + - - [33708, 3681, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19573.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19312.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17451.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18779.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18837.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18748.0] + - - [159, 162, 400, 64, 159, 159, 64, 64] + - [0, 8692.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15579.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19126.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18421.0] + - - [33708, 3584, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19851.0] + - - [33708, 3640, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19364.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18597.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18867.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16706.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18677.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19074.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18442.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19331.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16153.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17820.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18346.0] + - - [33708, 3894, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19333.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16055.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16207.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18716.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19133.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19120.0] + - - [133, 133, 480, 64, 133, 133, 64, 64] + - [17, 6259.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16813.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17141.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16516.0] + - - [134, 134, 480, 64, 134, 134, 64, 64] + - [17, 6374.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16770.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18866.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18874.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15892.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16304.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18954.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19286.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15518.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18697.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17595.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18433.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18907.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18740.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18072.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19022.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16497.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 14942.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17128.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18671.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18778.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16835.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17001.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17967.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18927.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16697.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17305.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18953.0] + - - [232, 232, 272, 64, 232, 232, 64, 64] + - [42, 10547.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17214.0] + - - [228, 232, 272, 64, 228, 228, 64, 64] + - [6, 10020.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19186.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18822.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18005.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18864.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18797.0] + - - [33708, 3968, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19731.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15992.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17743.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18789.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18862.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16720.0] + - - [33708, 3944, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19587.0] + - - [135, 135, 480, 64, 135, 135, 64, 64] + - [45, 6394.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19264.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18942.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16813.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18835.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19222.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16113.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18989.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17275.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19154.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 16791.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19004.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17372.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18663.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19133.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15288.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19085.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16881.0] + - - [133, 135, 480, 64, 133, 133, 64, 64] + - [17, 6331.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18718.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17676.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19229.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19223.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18877.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16650.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18847.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15965.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16949.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17333.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18818.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19116.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17591.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18403.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16951.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15784.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16976.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18965.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15495.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18868.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19084.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18116.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18475.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19212.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16477.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17985.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18028.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18714.0] + - - [33708, 3996, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19510.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18440.0] + - - [33708, 3978, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19344.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18743.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18743.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19401.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18110.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18747.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17580.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18896.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18863.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17702.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17846.0] + - - [33708, 3939, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19552.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18539.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16950.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15764.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18888.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15344.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19389.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18719.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18927.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19247.0] + - - [33708, 4026, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19556.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18610.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18459.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16338.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17413.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17418.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17937.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18549.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18565.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19005.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16662.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16558.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19139.0] + - - [147, 147, 432, 64, 147, 147, 64, 64] + - [0, 7511.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18817.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15804.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15864.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17791.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18980.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19035.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19421.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18234.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18718.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18441.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18387.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18876.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18056.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19014.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17705.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15981.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18666.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18150.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18560.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16826.0] + - - [148, 148, 432, 64, 148, 148, 64, 64] + - [0, 7593.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16191.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18343.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16992.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15888.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18733.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19009.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18510.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18416.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19355.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18247.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16779.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18378.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16834.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 16873.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18828.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19138.0] + - - [33708, 3976, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19345.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16746.0] + - - [33708, 3910, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19431.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18671.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18837.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18874.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16349.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17562.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18745.0] + - - [193, 193, 320, 64, 193, 193, 64, 64] + - [37, 8610.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17953.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16598.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16835.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18013.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18478.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19332.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16251.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18458.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18959.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15824.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18978.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17846.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18634.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15737.0] + - - [33708, 3876, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19262.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18720.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18743.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18944.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19338.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18239.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16101.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18058.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18344.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19203.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18839.0] + - - [33708, 3975, 1, 1024, 33708, 33708, 1024, 1024] + - [32, 19329.0] + - - [148, 147, 432, 64, 148, 148, 64, 64] + - [36, 7511.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18488.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18901.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16781.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16805.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18877.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17112.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18840.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18947.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19045.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18221.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18412.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18710.0] + - - [143, 143, 432, 64, 143, 143, 64, 64] + - [0, 7149.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18941.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18929.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18465.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18962.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19084.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19214.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18287.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18504.0] + - - [177, 177, 352, 64, 177, 177, 64, 64] + - [17, 9318.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18905.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18152.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18868.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17006.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18943.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15377.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17045.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18599.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19134.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18973.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19290.0] + - - [33708, 4059, 1, 1024, 33708, 33708, 1024, 1024] + - [32, 19725.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16595.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16927.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15942.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16411.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18818.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18753.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18873.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16880.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16340.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18591.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 15533.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17021.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16844.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18420.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19305.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18702.0] + - - [33708, 3990, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19432.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16753.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18773.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19150.0] + - - [160, 159, 400, 64, 160, 160, 64, 64] + - [17, 8798.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16981.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17039.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18279.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17457.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18920.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19208.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19051.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15718.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18200.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17047.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 16196.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18776.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17005.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17009.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18959.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19268.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18920.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17568.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18488.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16581.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18307.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18940.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18929.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18496.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16901.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18557.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18723.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17042.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18985.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18808.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18932.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18770.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17856.0] + - - [33708, 3995, 1, 1024, 33708, 33708, 1024, 1024] + - [32, 19433.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18752.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19165.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18531.0] + - - [33708, 3796, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19636.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18739.0] + - - [33708, 3859, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19200.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19205.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18876.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18859.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16817.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18718.0] + - - [135, 134, 480, 64, 135, 135, 64, 64] + - [17, 6361.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16462.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19148.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17592.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18753.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18734.0] + - - [33708, 3840, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19902.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18915.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16311.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18880.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16468.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18285.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18727.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16579.0] + - - [228, 228, 272, 64, 228, 228, 64, 64] + - [4, 10209.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18774.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19017.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16851.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18589.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19267.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18244.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 16124.0] + - - [33708, 3870, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 19201.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18925.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16514.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18833.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19135.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18222.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18891.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19309.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18839.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19036.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18028.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17160.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18515.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18488.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19227.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17526.0] + - - [143, 148, 432, 64, 143, 143, 64, 64] + - [17, 7336.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16360.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18984.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 15749.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16649.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18883.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19212.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16836.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18720.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18586.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18261.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18437.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18974.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16809.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15836.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18681.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18746.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18548.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16897.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18685.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19334.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15909.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18762.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16367.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18836.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18820.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18937.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 15784.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18430.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18612.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18779.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16141.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18920.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18953.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18538.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18875.0] + - - [159, 159, 400, 64, 159, 159, 64, 64] + - [0, 8595.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15687.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19058.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17679.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18047.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16949.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17591.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18728.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [49, 17315.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18510.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18957.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19147.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15669.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19073.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19243.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18846.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17541.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19016.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19061.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18316.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18913.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17607.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17100.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16352.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18661.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19137.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16427.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19110.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18706.0] + - - [33708, 4005, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19497.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15764.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18255.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18507.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16693.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19217.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18946.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18351.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18995.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16274.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18731.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19027.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16133.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19365.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16500.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16178.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19103.0] + - - [132, 132, 480, 64, 132, 132, 64, 64] + - [0, 6152.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17344.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18711.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16590.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18555.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15756.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16922.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16355.0] + - - [33708, 3977, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19322.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19056.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18740.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18728.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18626.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18704.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19364.0] + - - [33708, 4030, 1, 1024, 33708, 33708, 1024, 1024] + - [32, 19577.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18220.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16628.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15985.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15212.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17899.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18951.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16084.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18764.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19155.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18707.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18571.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16660.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18706.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17725.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19077.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19298.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18741.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16302.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17435.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17577.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19102.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18762.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16949.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16972.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17611.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 15693.0] + - - [33708, 3822, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19800.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18903.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18137.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18104.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17072.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19217.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18339.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18929.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19385.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16407.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17886.0] + - - [33708, 3999, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19463.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17705.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18937.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16271.0] + - - [33708, 3925, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19480.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16191.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18905.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17361.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18843.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18776.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18587.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19104.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18187.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17767.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15795.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18229.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15865.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16856.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16519.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18927.0] + - - [33708, 3751, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19441.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18334.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18942.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19375.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18874.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19185.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17834.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16865.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17088.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19402.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19152.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16189.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18776.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16761.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15773.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19243.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19224.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19184.0] + - - [33708, 3906, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19420.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18594.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18904.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17934.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18458.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17566.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18671.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16303.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18029.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18962.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17825.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18066.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19227.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18225.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17299.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18658.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18722.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15872.0] + - - [33708, 3955, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19627.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19314.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17990.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16668.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18779.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17555.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18988.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19193.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18740.0] + - - [33708, 3969, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19328.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16479.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18762.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18858.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15583.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18942.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19189.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18809.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18680.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18925.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18724.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18800.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16671.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18883.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18877.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 17004.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17188.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18549.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18935.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16752.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16541.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17741.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18179.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17110.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17287.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19000.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18760.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18308.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18892.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16676.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18743.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15976.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16597.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18826.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18882.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19040.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16380.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17863.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19203.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18626.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15684.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18307.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18286.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18836.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18680.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18348.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18996.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16575.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 15458.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18718.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19140.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15514.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17769.0] + - - [160, 160, 400, 64, 160, 160, 64, 64] + - [36, 8868.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19356.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16817.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18484.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18150.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18531.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18733.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16275.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16354.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19140.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17656.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18819.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [49, 16856.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17146.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17524.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18789.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19110.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18411.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16035.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18118.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19214.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16789.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17089.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16869.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17237.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18702.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17353.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18787.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19123.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18926.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18542.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16593.0] + - - [33708, 4032, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19617.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17246.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18941.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19223.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17969.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15864.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18755.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16802.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18993.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19316.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18947.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18785.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15445.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 16827.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18831.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18818.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19328.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18579.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18488.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18697.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18768.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18172.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18818.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16775.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18853.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17681.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18418.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15696.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18329.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19047.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17319.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19124.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18776.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17810.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15824.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18659.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17084.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16725.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18804.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18998.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17530.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18863.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18571.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19208.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18983.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18827.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18600.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16175.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18865.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19055.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19022.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18676.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18599.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18279.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19103.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16562.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17559.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19063.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18310.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18766.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15054.0] + - - [33708, 3780, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19594.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19182.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19207.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18179.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18981.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18813.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17312.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18801.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19003.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18041.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18990.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19188.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17454.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17664.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18803.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19066.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18090.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18495.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19140.0] + - - [33708, 3956, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19638.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18677.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18818.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18335.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18990.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17404.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16754.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18751.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15669.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16860.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18732.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19382.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18811.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19165.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18788.0] + - - [33708, 3720, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19312.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16772.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17631.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18888.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19277.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18866.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17025.0] + - - [33708, 3860, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19192.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18908.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18637.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18455.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15932.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18909.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19284.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16191.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19142.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16087.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19043.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18740.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18073.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16760.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19018.0] + - - [33708, 4012, 1, 1024, 33708, 33708, 1024, 1024] + - [32, 19547.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18700.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15842.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17590.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17522.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18955.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15587.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19071.0] + - - [33708, 4050, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19676.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16672.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18798.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17613.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17541.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17335.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18319.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19076.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18978.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18710.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18023.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18877.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18636.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15670.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16194.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18871.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16310.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 15247.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19032.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18542.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15877.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18833.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 17440.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16402.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19013.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 18800.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18339.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18792.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19053.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16181.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18224.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19393.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17450.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18806.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17247.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18814.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19196.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16622.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18592.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18903.0] + - - [33708, 4020, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 19518.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17365.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18648.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 16882.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17519.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19265.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18656.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16530.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18732.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18392.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19310.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15427.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18744.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18731.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 15908.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18762.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19035.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16109.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19266.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17829.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18674.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18599.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19036.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15791.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18804.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16692.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18194.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19061.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19218.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18533.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18651.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17336.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18856.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18873.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18970.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16484.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18174.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18526.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 17076.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 16555.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18490.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18948.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18532.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17052.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18900.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16408.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18934.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16954.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 15357.0] + - - [33708, 3900, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 19335.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18998.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18949.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 16586.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18748.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18032.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18196.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18779.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 18863.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 17007.0] + - - [33708, 3942, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 19575.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19051.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17138.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19293.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18748.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18662.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16400.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17991.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17277.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18928.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18236.0] + - - [1024, 3712, 1, 36548, 1024, 1024, 36548, 36548] + - [1, 18114.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 17188.0] + - - [4032, 384, 1, 64, 4032, 4032, 64, 64] + - [0, 10564.0] + - - [1024, 2048, 1, 49, 1024, 1024, 49, 49] + - [8, 11317.0] + - - [4608, 512, 1, 49, 4608, 4608, 49, 49] + - [36, 11514.0] + - - [9216, 512, 1, 4096, 9216, 9216, 4096, 4096] + - [32, 18389.0] + - - [3456, 384, 1, 289, 3456, 3456, 289, 289] + - [36, 14100.0] + - - [3456, 384, 1, 169, 3456, 3456, 169, 169] + - [36, 13626.0] + - - [4096, 512, 1, 1001, 4096, 4096, 1001, 1001] + - [7, 16899.0] + - - [384, 448, 49, 512, 384, 384, 512, 512] + - [49, 16030.0] + - - [384, 448, 64, 256, 384, 384, 256, 256] + - [13, 15753.0] + - - [384, 448, 36, 256, 384, 384, 256, 256] + - [12, 15230.0] + - - [384, 448, 49, 256, 384, 384, 256, 256] + - [29, 15637.0] + - - [384, 448, 64, 512, 384, 384, 512, 512] + - [49, 16160.0] + - - [384, 448, 36, 512, 384, 384, 512, 512] + - [13, 16209.0] + - - [1024, 6400, 1, 65, 1024, 1024, 65, 65] + - [0, 15558.0] + - - [4096, 6400, 1, 256, 4096, 4096, 256, 256] + - [13, 19075.0] + - - [512, 3194, 1, 2048, 512, 512, 2048, 2048] + - [49, 15825.0] + - - [512, 3222, 1, 2048, 512, 512, 2048, 2048] + - [30, 16412.0] + - - [512, 3234, 1, 2048, 512, 512, 2048, 2048] + - [30, 16466.0] + - - [512, 3242, 1, 2048, 512, 512, 2048, 2048] + - [30, 16475.0] + - - [512, 3257, 1, 2048, 512, 512, 2048, 2048] + - [30, 16598.0] + - - [512, 3332, 1, 2048, 512, 512, 2048, 2048] + - [31, 16389.0] + - - [512, 3336, 1, 2048, 512, 512, 2048, 2048] + - [14, 16430.0] + - - [512, 3378, 1, 2048, 512, 512, 2048, 2048] + - [50, 16556.0] + - - [512, 3396, 1, 2048, 512, 512, 2048, 2048] + - [50, 16668.0] + - - [512, 3399, 1, 2048, 512, 512, 2048, 2048] + - [31, 16686.0] + - - [512, 3451, 1, 2048, 512, 512, 2048, 2048] + - [31, 16908.0] + - - [512, 3456, 1, 2048, 512, 512, 2048, 2048] + - [31, 17036.0] + - - [512, 3458, 1, 2048, 512, 512, 2048, 2048] + - [51, 14601.0] + - - [512, 3467, 1, 2048, 512, 512, 2048, 2048] + - [32, 14631.0] + - - [512, 3468, 1, 2048, 512, 512, 2048, 2048] + - [51, 14624.0] + - - [512, 3470, 1, 2048, 512, 512, 2048, 2048] + - [15, 14646.0] + - - [512, 3477, 1, 2048, 512, 512, 2048, 2048] + - [15, 14686.0] + - - [512, 3478, 1, 2048, 512, 512, 2048, 2048] + - [15, 14669.0] + - - [512, 3495, 1, 2048, 512, 512, 2048, 2048] + - [15, 14754.0] + - - [512, 3507, 1, 2048, 512, 512, 2048, 2048] + - [15, 14799.0] + - - [512, 3515, 1, 2048, 512, 512, 2048, 2048] + - [15, 14830.0] + - - [512, 3517, 1, 2048, 512, 512, 2048, 2048] + - [51, 14834.0] + - - [2048, 2864, 1, 512, 2048, 2048, 512, 512] + - [13, 16953.0] + - - [2048, 3287, 1, 512, 2048, 2048, 512, 512] + - [13, 17842.0] + - - [2048, 3412, 1, 512, 2048, 2048, 512, 512] + - [30, 18455.0] + - - [2048, 3456, 1, 512, 2048, 2048, 512, 512] + - [13, 18910.0] + - - [2048, 3466, 1, 512, 2048, 2048, 512, 512] + - [30, 17610.0] + - - [2048, 3476, 1, 512, 2048, 2048, 512, 512] + - [30, 17637.0] + - - [2048, 3999, 1, 512, 2048, 2048, 512, 512] + - [30, 17833.0] + - - [33708, 189, 1, 512, 33708, 33708, 512, 512] + - [29, 15589.0] + - - [33708, 2496, 1, 512, 33708, 33708, 512, 512] + - [13, 19163.0] + - - [33708, 3864, 1, 512, 33708, 33708, 512, 512] + - [13, 19245.0] + - - [33708, 3969, 1, 512, 33708, 33708, 512, 512] + - [13, 19153.0] + - - [33708, 3995, 1, 512, 33708, 33708, 512, 512] + - [13, 19264.0] + - - [134, 134, 240, 64, 134, 134, 64, 64] + - [17, 5998.0] + - - [135, 134, 240, 64, 135, 135, 64, 64] + - [36, 6137.0] + - - [135, 135, 240, 64, 135, 135, 64, 64] + - [25, 6134.0] + - - [512, 2790, 1, 2048, 512, 512, 2048, 2048] + - [48, 14746.0] + - - [512, 2864, 1, 2048, 512, 512, 2048, 2048] + - [30, 14644.0] + - - [512, 3092, 1, 2048, 512, 512, 2048, 2048] + - [13, 15777.0] + - - [512, 3113, 1, 2048, 512, 512, 2048, 2048] + - [30, 15853.0] + - - [512, 3137, 1, 2048, 512, 512, 2048, 2048] + - [13, 15977.0] + - - [512, 3165, 1, 2048, 512, 512, 2048, 2048] + - [13, 16106.0] + - - [512, 3166, 1, 2048, 512, 512, 2048, 2048] + - [49, 16108.0] + - - [512, 3219, 1, 2048, 512, 512, 2048, 2048] + - [13, 16412.0] + - - [512, 3237, 1, 2048, 512, 512, 2048, 2048] + - [49, 16472.0] + - - [512, 3246, 1, 2048, 512, 512, 2048, 2048] + - [13, 16484.0] + - - [512, 3249, 1, 2048, 512, 512, 2048, 2048] + - [30, 16552.0] + - - [512, 3251, 1, 2048, 512, 512, 2048, 2048] + - [49, 16611.0] + - - [512, 3262, 1, 2048, 512, 512, 2048, 2048] + - [30, 16627.0] + - - [512, 3268, 1, 2048, 512, 512, 2048, 2048] + - [13, 16659.0] + - - [512, 3282, 1, 2048, 512, 512, 2048, 2048] + - [13, 16693.0] + - - [512, 3286, 1, 2048, 512, 512, 2048, 2048] + - [30, 16778.0] + - - [512, 3287, 1, 2048, 512, 512, 2048, 2048] + - [30, 16741.0] + - - [512, 3293, 1, 2048, 512, 512, 2048, 2048] + - [30, 16775.0] + - - [512, 3297, 1, 2048, 512, 512, 2048, 2048] + - [30, 16766.0] + - - [512, 3307, 1, 2048, 512, 512, 2048, 2048] + - [13, 16840.0] + - - [512, 3314, 1, 2048, 512, 512, 2048, 2048] + - [49, 16851.0] + - - [512, 3315, 1, 2048, 512, 512, 2048, 2048] + - [49, 16856.0] + - - [512, 3319, 1, 2048, 512, 512, 2048, 2048] + - [30, 16857.0] + - - [512, 3322, 1, 2048, 512, 512, 2048, 2048] + - [49, 16969.0] + - - [512, 3323, 1, 2048, 512, 512, 2048, 2048] + - [30, 16918.0] + - - [512, 3324, 1, 2048, 512, 512, 2048, 2048] + - [49, 16887.0] + - - [512, 3325, 1, 2048, 512, 512, 2048, 2048] + - [49, 16915.0] + - - [512, 3327, 1, 2048, 512, 512, 2048, 2048] + - [49, 16943.0] + - - [512, 3329, 1, 2048, 512, 512, 2048, 2048] + - [31, 16384.0] + - - [512, 3339, 1, 2048, 512, 512, 2048, 2048] + - [50, 16413.0] + - - [512, 3342, 1, 2048, 512, 512, 2048, 2048] + - [31, 16438.0] + - - [512, 3344, 1, 2048, 512, 512, 2048, 2048] + - [50, 16451.0] + - - [512, 3358, 1, 2048, 512, 512, 2048, 2048] + - [14, 16543.0] + - - [512, 3360, 1, 2048, 512, 512, 2048, 2048] + - [50, 16560.0] + - - [512, 3364, 1, 2048, 512, 512, 2048, 2048] + - [31, 16542.0] + - - [512, 3365, 1, 2048, 512, 512, 2048, 2048] + - [31, 16552.0] + - - [512, 3369, 1, 2048, 512, 512, 2048, 2048] + - [50, 16576.0] + - - [512, 3371, 1, 2048, 512, 512, 2048, 2048] + - [50, 16606.0] + - - [512, 3374, 1, 2048, 512, 512, 2048, 2048] + - [31, 16571.0] + - - [512, 3376, 1, 2048, 512, 512, 2048, 2048] + - [31, 16621.0] + - - [512, 3377, 1, 2048, 512, 512, 2048, 2048] + - [50, 16570.0] + - - [512, 3381, 1, 2048, 512, 512, 2048, 2048] + - [50, 16636.0] + - - [512, 3382, 1, 2048, 512, 512, 2048, 2048] + - [31, 16652.0] + - - [512, 3383, 1, 2048, 512, 512, 2048, 2048] + - [31, 16634.0] + - - [512, 3384, 1, 2048, 512, 512, 2048, 2048] + - [31, 16661.0] + - - [512, 3385, 1, 2048, 512, 512, 2048, 2048] + - [50, 16653.0] + - - [512, 3386, 1, 2048, 512, 512, 2048, 2048] + - [50, 16636.0] + - - [512, 3388, 1, 2048, 512, 512, 2048, 2048] + - [50, 16663.0] + - - [512, 3390, 1, 2048, 512, 512, 2048, 2048] + - [50, 16665.0] + - - [512, 3391, 1, 2048, 512, 512, 2048, 2048] + - [31, 16701.0] + - - [512, 3402, 1, 2048, 512, 512, 2048, 2048] + - [31, 16710.0] + - - [512, 3410, 1, 2048, 512, 512, 2048, 2048] + - [31, 16760.0] + - - [512, 3412, 1, 2048, 512, 512, 2048, 2048] + - [14, 16759.0] + - - [512, 3414, 1, 2048, 512, 512, 2048, 2048] + - [50, 16775.0] + - - [512, 3415, 1, 2048, 512, 512, 2048, 2048] + - [50, 16790.0] + - - [512, 3418, 1, 2048, 512, 512, 2048, 2048] + - [14, 16792.0] + - - [512, 3420, 1, 2048, 512, 512, 2048, 2048] + - [14, 16795.0] + - - [512, 3422, 1, 2048, 512, 512, 2048, 2048] + - [50, 16829.0] + - - [512, 3425, 1, 2048, 512, 512, 2048, 2048] + - [31, 16832.0] + - - [512, 3426, 1, 2048, 512, 512, 2048, 2048] + - [31, 16812.0] + - - [512, 3427, 1, 2048, 512, 512, 2048, 2048] + - [31, 16800.0] + - - [512, 3428, 1, 2048, 512, 512, 2048, 2048] + - [50, 16816.0] + - - [512, 3430, 1, 2048, 512, 512, 2048, 2048] + - [14, 16846.0] + - - [512, 3431, 1, 2048, 512, 512, 2048, 2048] + - [50, 16810.0] + - - [512, 3432, 1, 2048, 512, 512, 2048, 2048] + - [50, 16846.0] + - - [512, 3438, 1, 2048, 512, 512, 2048, 2048] + - [50, 16858.0] + - - [512, 3439, 1, 2048, 512, 512, 2048, 2048] + - [50, 16889.0] + - - [512, 3440, 1, 2048, 512, 512, 2048, 2048] + - [31, 16878.0] + - - [512, 3443, 1, 2048, 512, 512, 2048, 2048] + - [31, 16905.0] + - - [512, 3445, 1, 2048, 512, 512, 2048, 2048] + - [31, 16886.0] + - - [512, 3447, 1, 2048, 512, 512, 2048, 2048] + - [31, 16917.0] + - - [512, 3448, 1, 2048, 512, 512, 2048, 2048] + - [31, 16928.0] + - - [512, 3450, 1, 2048, 512, 512, 2048, 2048] + - [50, 16938.0] + - - [512, 3452, 1, 2048, 512, 512, 2048, 2048] + - [31, 16929.0] + - - [512, 3453, 1, 2048, 512, 512, 2048, 2048] + - [31, 16956.0] + - - [512, 3455, 1, 2048, 512, 512, 2048, 2048] + - [31, 16954.0] + - - [512, 3457, 1, 2048, 512, 512, 2048, 2048] + - [15, 14580.0] + - - [512, 3459, 1, 2048, 512, 512, 2048, 2048] + - [15, 14612.0] + - - [512, 3460, 1, 2048, 512, 512, 2048, 2048] + - [51, 14614.0] + - - [512, 3461, 1, 2048, 512, 512, 2048, 2048] + - [15, 14605.0] + - - [512, 3462, 1, 2048, 512, 512, 2048, 2048] + - [51, 14617.0] + - - [512, 3466, 1, 2048, 512, 512, 2048, 2048] + - [32, 14632.0] + - - [512, 3471, 1, 2048, 512, 512, 2048, 2048] + - [51, 14623.0] + - - [512, 3472, 1, 2048, 512, 512, 2048, 2048] + - [32, 14627.0] + - - [512, 3475, 1, 2048, 512, 512, 2048, 2048] + - [15, 14681.0] + - - [512, 3476, 1, 2048, 512, 512, 2048, 2048] + - [15, 14656.0] + - - [512, 3479, 1, 2048, 512, 512, 2048, 2048] + - [51, 14684.0] + - - [512, 3480, 1, 2048, 512, 512, 2048, 2048] + - [51, 14677.0] + - - [512, 3481, 1, 2048, 512, 512, 2048, 2048] + - [51, 14687.0] + - - [512, 3483, 1, 2048, 512, 512, 2048, 2048] + - [51, 14716.0] + - - [512, 3484, 1, 2048, 512, 512, 2048, 2048] + - [32, 14695.0] + - - [512, 3487, 1, 2048, 512, 512, 2048, 2048] + - [51, 14710.0] + - - [512, 3489, 1, 2048, 512, 512, 2048, 2048] + - [15, 14720.0] + - - [512, 3490, 1, 2048, 512, 512, 2048, 2048] + - [51, 14710.0] + - - [512, 3491, 1, 2048, 512, 512, 2048, 2048] + - [15, 14739.0] + - - [512, 3493, 1, 2048, 512, 512, 2048, 2048] + - [51, 14757.0] + - - [512, 3494, 1, 2048, 512, 512, 2048, 2048] + - [51, 14734.0] + - - [512, 3497, 1, 2048, 512, 512, 2048, 2048] + - [15, 14766.0] + - - [512, 3498, 1, 2048, 512, 512, 2048, 2048] + - [51, 14747.0] + - - [512, 3499, 1, 2048, 512, 512, 2048, 2048] + - [51, 14769.0] + - - [512, 3501, 1, 2048, 512, 512, 2048, 2048] + - [51, 14769.0] + - - [512, 3503, 1, 2048, 512, 512, 2048, 2048] + - [51, 14765.0] + - - [512, 3508, 1, 2048, 512, 512, 2048, 2048] + - [15, 14798.0] + - - [512, 3509, 1, 2048, 512, 512, 2048, 2048] + - [15, 14777.0] + - - [512, 3511, 1, 2048, 512, 512, 2048, 2048] + - [51, 14796.0] + - - [512, 3514, 1, 2048, 512, 512, 2048, 2048] + - [15, 14841.0] + - - [512, 3518, 1, 2048, 512, 512, 2048, 2048] + - [15, 14860.0] + - - [512, 3519, 1, 2048, 512, 512, 2048, 2048] + - [15, 14857.0] + - - [512, 3520, 1, 2048, 512, 512, 2048, 2048] + - [15, 14872.0] + - - [512, 3523, 1, 2048, 512, 512, 2048, 2048] + - [15, 14874.0] + - - [512, 3528, 1, 2048, 512, 512, 2048, 2048] + - [15, 14858.0] + - - [512, 3529, 1, 2048, 512, 512, 2048, 2048] + - [15, 14860.0] + - - [512, 3530, 1, 2048, 512, 512, 2048, 2048] + - [15, 14890.0] + - - [512, 3532, 1, 2048, 512, 512, 2048, 2048] + - [51, 14934.0] + - - [512, 3533, 1, 2048, 512, 512, 2048, 2048] + - [15, 14912.0] + - - [512, 3534, 1, 2048, 512, 512, 2048, 2048] + - [15, 14909.0] + - - [512, 3538, 1, 2048, 512, 512, 2048, 2048] + - [15, 14934.0] + - - [512, 3539, 1, 2048, 512, 512, 2048, 2048] + - [32, 14931.0] + - - [512, 3541, 1, 2048, 512, 512, 2048, 2048] + - [15, 14921.0] + - - [512, 3547, 1, 2048, 512, 512, 2048, 2048] + - [15, 14956.0] + - - [512, 3548, 1, 2048, 512, 512, 2048, 2048] + - [15, 14944.0] + - - [512, 3552, 1, 2048, 512, 512, 2048, 2048] + - [15, 14978.0] + - - [512, 3564, 1, 2048, 512, 512, 2048, 2048] + - [15, 15025.0] + - - [512, 3575, 1, 2048, 512, 512, 2048, 2048] + - [15, 15038.0] + - - [512, 3598, 1, 2048, 512, 512, 2048, 2048] + - [51, 15161.0] + - - [512, 3599, 1, 2048, 512, 512, 2048, 2048] + - [32, 15179.0] + - - [512, 3608, 1, 2048, 512, 512, 2048, 2048] + - [51, 15210.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [15, 14667.0] + - - [512, 3780, 1, 2048, 512, 512, 2048, 2048] + - [32, 15955.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [15, 14634.0] + - - [512, 3796, 1, 2048, 512, 512, 2048, 2048] + - [51, 15969.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [15, 14751.0] + - - [512, 3822, 1, 2048, 512, 512, 2048, 2048] + - [15, 16103.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [32, 15530.0] + - - [512, 3840, 1, 2048, 512, 512, 2048, 2048] + - [32, 16353.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [11, 15076.0] + - - [512, 3859, 1, 2048, 512, 512, 2048, 2048] + - [15, 16226.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [11, 15065.0] + - - [512, 3870, 1, 2048, 512, 512, 2048, 2048] + - [51, 16289.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [11, 15143.0] + - - [512, 3876, 1, 2048, 512, 512, 2048, 2048] + - [32, 16312.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [28, 15169.0] + - - [512, 3906, 1, 2048, 512, 512, 2048, 2048] + - [15, 16416.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [15, 15127.0] + - - [512, 3910, 1, 2048, 512, 512, 2048, 2048] + - [15, 16431.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [11, 15216.0] + - - [512, 3925, 1, 2048, 512, 512, 2048, 2048] + - [15, 16490.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [51, 15228.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [11, 15192.0] + - - [512, 3942, 1, 2048, 512, 512, 2048, 2048] + - [32, 16575.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [11, 15285.0] + - - [512, 3944, 1, 2048, 512, 512, 2048, 2048] + - [15, 16581.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [51, 15247.0] + - - [512, 3955, 1, 2048, 512, 512, 2048, 2048] + - [32, 16643.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [26, 15558.0] + - - [512, 3968, 1, 2048, 512, 512, 2048, 2048] + - [51, 16686.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [11, 15373.0] + - - [512, 3969, 1, 2048, 512, 512, 2048, 2048] + - [15, 16667.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [11, 15409.0] + - - [512, 3976, 1, 2048, 512, 512, 2048, 2048] + - [15, 16697.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [11, 15436.0] + - - [512, 3977, 1, 2048, 512, 512, 2048, 2048] + - [15, 16711.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [15, 15344.0] + - - [512, 3978, 1, 2048, 512, 512, 2048, 2048] + - [15, 16698.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [51, 15386.0] + - - [512, 3990, 1, 2048, 512, 512, 2048, 2048] + - [15, 16774.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [32, 15356.0] + - - [512, 3995, 1, 2048, 512, 512, 2048, 2048] + - [15, 16799.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [51, 15396.0] + - - [512, 3996, 1, 2048, 512, 512, 2048, 2048] + - [51, 16791.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [11, 15554.0] + - - [512, 3999, 1, 2048, 512, 512, 2048, 2048] + - [15, 16774.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [51, 15453.0] + - - [512, 4005, 1, 2048, 512, 512, 2048, 2048] + - [15, 16796.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [51, 15439.0] + - - [512, 4012, 1, 2048, 512, 512, 2048, 2048] + - [15, 16856.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [26, 15497.0] + - - [512, 4020, 1, 2048, 512, 512, 2048, 2048] + - [51, 16873.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [11, 15548.0] + - - [512, 4026, 1, 2048, 512, 512, 2048, 2048] + - [15, 16912.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [11, 15563.0] + - - [512, 4030, 1, 2048, 512, 512, 2048, 2048] + - [15, 16944.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [28, 15521.0] + - - [512, 4032, 1, 2048, 512, 512, 2048, 2048] + - [15, 16929.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [51, 15627.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [15, 15606.0] + - - [2048, 2790, 1, 512, 2048, 2048, 512, 512] + - [13, 17913.0] + - - [2048, 3092, 1, 512, 2048, 2048, 512, 512] + - [15, 16929.0] + - - [2048, 3113, 1, 512, 2048, 2048, 512, 512] + - [30, 17106.0] + - - [2048, 3137, 1, 512, 2048, 2048, 512, 512] + - [13, 17235.0] + - - [2048, 3165, 1, 512, 2048, 2048, 512, 512] + - [30, 17334.0] + - - [2048, 3166, 1, 512, 2048, 2048, 512, 512] + - [30, 17385.0] + - - [2048, 3194, 1, 512, 2048, 2048, 512, 512] + - [30, 17511.0] + - - [2048, 3219, 1, 512, 2048, 2048, 512, 512] + - [30, 17476.0] + - - [2048, 3222, 1, 512, 2048, 2048, 512, 512] + - [30, 17513.0] + - - [2048, 3234, 1, 512, 2048, 2048, 512, 512] + - [15, 17565.0] + - - [2048, 3237, 1, 512, 2048, 2048, 512, 512] + - [30, 17656.0] + - - [2048, 3242, 1, 512, 2048, 2048, 512, 512] + - [30, 17676.0] + - - [2048, 3246, 1, 512, 2048, 2048, 512, 512] + - [30, 17623.0] + - - [2048, 3249, 1, 512, 2048, 2048, 512, 512] + - [30, 17740.0] + - - [2048, 3251, 1, 512, 2048, 2048, 512, 512] + - [30, 17733.0] + - - [2048, 3257, 1, 512, 2048, 2048, 512, 512] + - [15, 17716.0] + - - [2048, 3262, 1, 512, 2048, 2048, 512, 512] + - [30, 17745.0] + - - [2048, 3268, 1, 512, 2048, 2048, 512, 512] + - [30, 17801.0] + - - [2048, 3282, 1, 512, 2048, 2048, 512, 512] + - [30, 17835.0] + - - [2048, 3286, 1, 512, 2048, 2048, 512, 512] + - [30, 17892.0] + - - [2048, 3293, 1, 512, 2048, 2048, 512, 512] + - [30, 17937.0] + - - [2048, 3297, 1, 512, 2048, 2048, 512, 512] + - [30, 17954.0] + - - [2048, 3307, 1, 512, 2048, 2048, 512, 512] + - [30, 17969.0] + - - [2048, 3314, 1, 512, 2048, 2048, 512, 512] + - [13, 17916.0] + - - [2048, 3315, 1, 512, 2048, 2048, 512, 512] + - [30, 18061.0] + - - [2048, 3319, 1, 512, 2048, 2048, 512, 512] + - [30, 18010.0] + - - [2048, 3322, 1, 512, 2048, 2048, 512, 512] + - [30, 18024.0] + - - [2048, 3323, 1, 512, 2048, 2048, 512, 512] + - [30, 17983.0] + - - [2048, 3324, 1, 512, 2048, 2048, 512, 512] + - [30, 18095.0] + - - [2048, 3325, 1, 512, 2048, 2048, 512, 512] + - [30, 18039.0] + - - [2048, 3327, 1, 512, 2048, 2048, 512, 512] + - [13, 18027.0] + - - [2048, 3329, 1, 512, 2048, 2048, 512, 512] + - [13, 17986.0] + - - [2048, 3332, 1, 512, 2048, 2048, 512, 512] + - [13, 18039.0] + - - [2048, 3336, 1, 512, 2048, 2048, 512, 512] + - [30, 18115.0] + - - [2048, 3339, 1, 512, 2048, 2048, 512, 512] + - [13, 18003.0] + - - [2048, 3342, 1, 512, 2048, 2048, 512, 512] + - [30, 18077.0] + - - [2048, 3344, 1, 512, 2048, 2048, 512, 512] + - [13, 18101.0] + - - [2048, 3358, 1, 512, 2048, 2048, 512, 512] + - [30, 18218.0] + - - [2048, 3360, 1, 512, 2048, 2048, 512, 512] + - [30, 18240.0] + - - [2048, 3364, 1, 512, 2048, 2048, 512, 512] + - [13, 18209.0] + - - [2048, 3365, 1, 512, 2048, 2048, 512, 512] + - [30, 18197.0] + - - [2048, 3369, 1, 512, 2048, 2048, 512, 512] + - [30, 18262.0] + - - [2048, 3371, 1, 512, 2048, 2048, 512, 512] + - [30, 18273.0] + - - [2048, 3374, 1, 512, 2048, 2048, 512, 512] + - [13, 18252.0] + - - [2048, 3376, 1, 512, 2048, 2048, 512, 512] + - [30, 18329.0] + - - [2048, 3377, 1, 512, 2048, 2048, 512, 512] + - [30, 18289.0] + - - [2048, 3378, 1, 512, 2048, 2048, 512, 512] + - [30, 18347.0] + - - [2048, 3381, 1, 512, 2048, 2048, 512, 512] + - [30, 18320.0] + - - [2048, 3382, 1, 512, 2048, 2048, 512, 512] + - [30, 18340.0] + - - [2048, 3383, 1, 512, 2048, 2048, 512, 512] + - [30, 18325.0] + - - [2048, 3384, 1, 512, 2048, 2048, 512, 512] + - [30, 18420.0] + - - [2048, 3385, 1, 512, 2048, 2048, 512, 512] + - [30, 18364.0] + - - [2048, 3386, 1, 512, 2048, 2048, 512, 512] + - [13, 18286.0] + - - [2048, 3388, 1, 512, 2048, 2048, 512, 512] + - [30, 18419.0] + - - [2048, 3390, 1, 512, 2048, 2048, 512, 512] + - [30, 18391.0] + - - [2048, 3391, 1, 512, 2048, 2048, 512, 512] + - [30, 18387.0] + - - [2048, 3396, 1, 512, 2048, 2048, 512, 512] + - [30, 18439.0] + - - [2048, 3399, 1, 512, 2048, 2048, 512, 512] + - [13, 18374.0] + - - [2048, 3402, 1, 512, 2048, 2048, 512, 512] + - [30, 18493.0] + - - [2048, 3410, 1, 512, 2048, 2048, 512, 512] + - [13, 18452.0] + - - [2048, 3414, 1, 512, 2048, 2048, 512, 512] + - [30, 18589.0] + - - [2048, 3415, 1, 512, 2048, 2048, 512, 512] + - [30, 18567.0] + - - [2048, 3418, 1, 512, 2048, 2048, 512, 512] + - [13, 18522.0] + - - [2048, 3420, 1, 512, 2048, 2048, 512, 512] + - [30, 18546.0] + - - [2048, 3422, 1, 512, 2048, 2048, 512, 512] + - [30, 18573.0] + - - [2048, 3425, 1, 512, 2048, 2048, 512, 512] + - [30, 18554.0] + - - [2048, 3426, 1, 512, 2048, 2048, 512, 512] + - [30, 18541.0] + - - [2048, 3427, 1, 512, 2048, 2048, 512, 512] + - [13, 18525.0] + - - [2048, 3428, 1, 512, 2048, 2048, 512, 512] + - [30, 18576.0] + - - [2048, 3430, 1, 512, 2048, 2048, 512, 512] + - [13, 18516.0] + - - [2048, 3431, 1, 512, 2048, 2048, 512, 512] + - [30, 18631.0] + - - [2048, 3432, 1, 512, 2048, 2048, 512, 512] + - [13, 18546.0] + - - [2048, 3438, 1, 512, 2048, 2048, 512, 512] + - [30, 18550.0] + - - [2048, 3439, 1, 512, 2048, 2048, 512, 512] + - [13, 18592.0] + - - [2048, 3440, 1, 512, 2048, 2048, 512, 512] + - [13, 18540.0] + - - [2048, 3443, 1, 512, 2048, 2048, 512, 512] + - [13, 18560.0] + - - [2048, 3445, 1, 512, 2048, 2048, 512, 512] + - [30, 18561.0] + - - [2048, 3447, 1, 512, 2048, 2048, 512, 512] + - [30, 18685.0] + - - [2048, 3448, 1, 512, 2048, 2048, 512, 512] + - [30, 18638.0] + - - [2048, 3450, 1, 512, 2048, 2048, 512, 512] + - [30, 18588.0] + - - [2048, 3451, 1, 512, 2048, 2048, 512, 512] + - [30, 18659.0] + - - [2048, 3452, 1, 512, 2048, 2048, 512, 512] + - [30, 18681.0] + - - [2048, 3453, 1, 512, 2048, 2048, 512, 512] + - [30, 18639.0] + - - [2048, 3455, 1, 512, 2048, 2048, 512, 512] + - [30, 18732.0] + - - [2048, 3457, 1, 512, 2048, 2048, 512, 512] + - [30, 17508.0] + - - [2048, 3458, 1, 512, 2048, 2048, 512, 512] + - [30, 17527.0] + - - [2048, 3459, 1, 512, 2048, 2048, 512, 512] + - [30, 17527.0] + - - [2048, 3460, 1, 512, 2048, 2048, 512, 512] + - [30, 17441.0] + - - [2048, 3461, 1, 512, 2048, 2048, 512, 512] + - [30, 17551.0] + - - [2048, 3462, 1, 512, 2048, 2048, 512, 512] + - [13, 17434.0] + - - [2048, 3467, 1, 512, 2048, 2048, 512, 512] + - [30, 17564.0] + - - [2048, 3468, 1, 512, 2048, 2048, 512, 512] + - [30, 17600.0] + - - [2048, 3470, 1, 512, 2048, 2048, 512, 512] + - [30, 17629.0] + - - [2048, 3471, 1, 512, 2048, 2048, 512, 512] + - [30, 17522.0] + - - [2048, 3472, 1, 512, 2048, 2048, 512, 512] + - [30, 17595.0] + - - [2048, 3475, 1, 512, 2048, 2048, 512, 512] + - [30, 17579.0] + - - [2048, 3477, 1, 512, 2048, 2048, 512, 512] + - [30, 17622.0] + - - [2048, 3478, 1, 512, 2048, 2048, 512, 512] + - [30, 17750.0] + - - [2048, 3479, 1, 512, 2048, 2048, 512, 512] + - [13, 17527.0] + - - [2048, 3480, 1, 512, 2048, 2048, 512, 512] + - [30, 17644.0] + - - [2048, 3481, 1, 512, 2048, 2048, 512, 512] + - [30, 17637.0] + - - [2048, 3483, 1, 512, 2048, 2048, 512, 512] + - [30, 17591.0] + - - [2048, 3484, 1, 512, 2048, 2048, 512, 512] + - [30, 17664.0] + - - [2048, 3487, 1, 512, 2048, 2048, 512, 512] + - [30, 17643.0] + - - [2048, 3489, 1, 512, 2048, 2048, 512, 512] + - [30, 17594.0] + - - [2048, 3490, 1, 512, 2048, 2048, 512, 512] + - [30, 17713.0] + - - [2048, 3491, 1, 512, 2048, 2048, 512, 512] + - [30, 17606.0] + - - [2048, 3493, 1, 512, 2048, 2048, 512, 512] + - [30, 17761.0] + - - [2048, 3494, 1, 512, 2048, 2048, 512, 512] + - [30, 17730.0] + - - [2048, 3495, 1, 512, 2048, 2048, 512, 512] + - [30, 17733.0] + - - [2048, 3497, 1, 512, 2048, 2048, 512, 512] + - [30, 17750.0] + - - [2048, 3498, 1, 512, 2048, 2048, 512, 512] + - [30, 17735.0] + - - [2048, 3499, 1, 512, 2048, 2048, 512, 512] + - [30, 17694.0] + - - [2048, 3501, 1, 512, 2048, 2048, 512, 512] + - [13, 17651.0] + - - [2048, 3503, 1, 512, 2048, 2048, 512, 512] + - [30, 17753.0] + - - [2048, 3507, 1, 512, 2048, 2048, 512, 512] + - [30, 17738.0] + - - [2048, 3508, 1, 512, 2048, 2048, 512, 512] + - [30, 17842.0] + - - [2048, 3509, 1, 512, 2048, 2048, 512, 512] + - [30, 17868.0] + - - [2048, 3511, 1, 512, 2048, 2048, 512, 512] + - [30, 17842.0] + - - [2048, 3514, 1, 512, 2048, 2048, 512, 512] + - [30, 17768.0] + - - [2048, 3515, 1, 512, 2048, 2048, 512, 512] + - [30, 17835.0] + - - [2048, 3517, 1, 512, 2048, 2048, 512, 512] + - [30, 17821.0] + - - [2048, 3518, 1, 512, 2048, 2048, 512, 512] + - [30, 17845.0] + - - [2048, 3519, 1, 512, 2048, 2048, 512, 512] + - [30, 17848.0] + - - [2048, 3520, 1, 512, 2048, 2048, 512, 512] + - [30, 17836.0] + - - [2048, 3523, 1, 512, 2048, 2048, 512, 512] + - [30, 17894.0] + - - [2048, 3528, 1, 512, 2048, 2048, 512, 512] + - [30, 17863.0] + - - [2048, 3529, 1, 512, 2048, 2048, 512, 512] + - [30, 17863.0] + - - [2048, 3530, 1, 512, 2048, 2048, 512, 512] + - [30, 17906.0] + - - [2048, 3532, 1, 512, 2048, 2048, 512, 512] + - [30, 17885.0] + - - [2048, 3533, 1, 512, 2048, 2048, 512, 512] + - [30, 17914.0] + - - [2048, 3534, 1, 512, 2048, 2048, 512, 512] + - [30, 17807.0] + - - [2048, 3538, 1, 512, 2048, 2048, 512, 512] + - [30, 17884.0] + - - [2048, 3539, 1, 512, 2048, 2048, 512, 512] + - [30, 17879.0] + - - [2048, 3541, 1, 512, 2048, 2048, 512, 512] + - [30, 17875.0] + - - [2048, 3547, 1, 512, 2048, 2048, 512, 512] + - [30, 17994.0] + - - [2048, 3548, 1, 512, 2048, 2048, 512, 512] + - [30, 17864.0] + - - [2048, 3552, 1, 512, 2048, 2048, 512, 512] + - [30, 17972.0] + - - [2048, 3564, 1, 512, 2048, 2048, 512, 512] + - [30, 18059.0] + - - [2048, 3575, 1, 512, 2048, 2048, 512, 512] + - [30, 18116.0] + - - [2048, 3598, 1, 512, 2048, 2048, 512, 512] + - [30, 18046.0] + - - [2048, 3599, 1, 512, 2048, 2048, 512, 512] + - [13, 18063.0] + - - [2048, 3608, 1, 512, 2048, 2048, 512, 512] + - [13, 18097.0] + - - [2048, 3780, 1, 512, 2048, 2048, 512, 512] + - [30, 17820.0] + - - [2048, 3796, 1, 512, 2048, 2048, 512, 512] + - [13, 17897.0] + - - [2048, 3822, 1, 512, 2048, 2048, 512, 512] + - [30, 17935.0] + - - [2048, 3840, 1, 512, 2048, 2048, 512, 512] + - [15, 18314.0] + - - [2048, 3859, 1, 512, 2048, 2048, 512, 512] + - [30, 18149.0] + - - [2048, 3870, 1, 512, 2048, 2048, 512, 512] + - [30, 18176.0] + - - [2048, 3876, 1, 512, 2048, 2048, 512, 512] + - [30, 18186.0] + - - [2048, 3906, 1, 512, 2048, 2048, 512, 512] + - [30, 18378.0] + - - [2048, 3910, 1, 512, 2048, 2048, 512, 512] + - [13, 18289.0] + - - [2048, 3925, 1, 512, 2048, 2048, 512, 512] + - [13, 18362.0] + - - [2048, 3942, 1, 512, 2048, 2048, 512, 512] + - [13, 18420.0] + - - [2048, 3944, 1, 512, 2048, 2048, 512, 512] + - [30, 18514.0] + - - [2048, 3955, 1, 512, 2048, 2048, 512, 512] + - [30, 18562.0] + - - [2048, 3968, 1, 512, 2048, 2048, 512, 512] + - [30, 18722.0] + - - [2048, 3969, 1, 512, 2048, 2048, 512, 512] + - [30, 17529.0] + - - [2048, 3976, 1, 512, 2048, 2048, 512, 512] + - [30, 17611.0] + - - [2048, 3977, 1, 512, 2048, 2048, 512, 512] + - [30, 17631.0] + - - [2048, 3978, 1, 512, 2048, 2048, 512, 512] + - [30, 17609.0] + - - [2048, 3990, 1, 512, 2048, 2048, 512, 512] + - [30, 17749.0] + - - [2048, 3995, 1, 512, 2048, 2048, 512, 512] + - [30, 17704.0] + - - [2048, 3996, 1, 512, 2048, 2048, 512, 512] + - [30, 17827.0] + - - [2048, 4005, 1, 512, 2048, 2048, 512, 512] + - [30, 17745.0] + - - [2048, 4012, 1, 512, 2048, 2048, 512, 512] + - [30, 17772.0] + - - [2048, 4020, 1, 512, 2048, 2048, 512, 512] + - [30, 17789.0] + - - [2048, 4026, 1, 512, 2048, 2048, 512, 512] + - [30, 17873.0] + - - [2048, 4030, 1, 512, 2048, 2048, 512, 512] + - [30, 17836.0] + - - [2048, 4032, 1, 512, 2048, 2048, 512, 512] + - [30, 17841.0] + - - [33708, 184, 1, 512, 33708, 33708, 512, 512] + - [48, 15204.0] + - - [33708, 208, 1, 512, 33708, 33708, 512, 512] + - [13, 15088.0] + - - [33708, 246, 1, 512, 33708, 33708, 512, 512] + - [49, 17755.0] + - - [33708, 264, 1, 512, 33708, 33708, 512, 512] + - [30, 13257.0] + - - [33708, 465, 1, 512, 33708, 33708, 512, 512] + - [49, 17175.0] + - - [33708, 468, 1, 512, 33708, 33708, 512, 512] + - [49, 17262.0] + - - [33708, 493, 1, 512, 33708, 33708, 512, 512] + - [30, 18166.0] + - - [33708, 540, 1, 512, 33708, 33708, 512, 512] + - [30, 16306.0] + - - [33708, 550, 1, 512, 33708, 33708, 512, 512] + - [30, 16617.0] + - - [33708, 560, 1, 512, 33708, 33708, 512, 512] + - [30, 16893.0] + - - [33708, 644, 1, 512, 33708, 33708, 512, 512] + - [49, 16310.0] + - - [33708, 714, 1, 512, 33708, 33708, 512, 512] + - [49, 18040.0] + - - [33708, 720, 1, 512, 33708, 33708, 512, 512] + - [49, 18162.0] + - - [33708, 781, 1, 512, 33708, 33708, 512, 512] + - [49, 16815.0] + - - [33708, 936, 1, 512, 33708, 33708, 512, 512] + - [49, 17750.0] + - - [33708, 980, 1, 512, 33708, 33708, 512, 512] + - [30, 18544.0] + - - [33708, 1232, 1, 512, 33708, 33708, 512, 512] + - [13, 18652.0] + - - [33708, 1290, 1, 512, 33708, 33708, 512, 512] + - [13, 17870.0] + - - [33708, 1350, 1, 512, 33708, 33708, 512, 512] + - [13, 18684.0] + - - [33708, 1424, 1, 512, 33708, 33708, 512, 512] + - [13, 18066.0] + - - [33708, 1458, 1, 512, 33708, 33708, 512, 512] + - [13, 18546.0] + - - [33708, 1462, 1, 512, 33708, 33708, 512, 512] + - [13, 18574.0] + - - [33708, 1520, 1, 512, 33708, 33708, 512, 512] + - [13, 19274.0] + - - [33708, 1596, 1, 512, 33708, 33708, 512, 512] + - [13, 18727.0] + - - [33708, 1599, 1, 512, 33708, 33708, 512, 512] + - [13, 18752.0] + - - [33708, 1615, 1, 512, 33708, 33708, 512, 512] + - [13, 18936.0] + - - [33708, 1680, 1, 512, 33708, 33708, 512, 512] + - [13, 18346.0] + - - [33708, 1917, 1, 512, 33708, 33708, 512, 512] + - [13, 19587.0] + - - [33708, 2205, 1, 512, 33708, 33708, 512, 512] + - [13, 18817.0] + - - [33708, 2418, 1, 512, 33708, 33708, 512, 512] + - [13, 19558.0] + - - [33708, 3776, 1, 512, 33708, 33708, 512, 512] + - [13, 19454.0] + - - [33708, 3780, 1, 512, 33708, 33708, 512, 512] + - [13, 19470.0] + - - [33708, 3796, 1, 512, 33708, 33708, 512, 512] + - [13, 19558.0] + - - [33708, 3822, 1, 512, 33708, 33708, 512, 512] + - [13, 19665.0] + - - [33708, 3835, 1, 512, 33708, 33708, 512, 512] + - [13, 19749.0] + - - [33708, 3840, 1, 512, 33708, 33708, 512, 512] + - [13, 19764.0] + - - [33708, 3859, 1, 512, 33708, 33708, 512, 512] + - [13, 19229.0] + - - [33708, 3870, 1, 512, 33708, 33708, 512, 512] + - [13, 19271.0] + - - [33708, 3876, 1, 512, 33708, 33708, 512, 512] + - [13, 19302.0] + - - [33708, 3906, 1, 512, 33708, 33708, 512, 512] + - [13, 19427.0] + - - [33708, 3910, 1, 512, 33708, 33708, 512, 512] + - [13, 19472.0] + - - [33708, 3925, 1, 512, 33708, 33708, 512, 512] + - [13, 19551.0] + - - [33708, 3942, 1, 512, 33708, 33708, 512, 512] + - [13, 19628.0] + - - [33708, 3944, 1, 512, 33708, 33708, 512, 512] + - [13, 19634.0] + - - [33708, 3955, 1, 512, 33708, 33708, 512, 512] + - [13, 19701.0] + - - [33708, 3968, 1, 512, 33708, 33708, 512, 512] + - [30, 19754.0] + - - [33708, 3976, 1, 512, 33708, 33708, 512, 512] + - [13, 19206.0] + - - [33708, 3977, 1, 512, 33708, 33708, 512, 512] + - [13, 19191.0] + - - [33708, 3978, 1, 512, 33708, 33708, 512, 512] + - [13, 19158.0] + - - [33708, 3990, 1, 512, 33708, 33708, 512, 512] + - [13, 19255.0] + - - [33708, 3996, 1, 512, 33708, 33708, 512, 512] + - [13, 19281.0] + - - [33708, 3999, 1, 512, 33708, 33708, 512, 512] + - [13, 19303.0] + - - [33708, 4005, 1, 512, 33708, 33708, 512, 512] + - [13, 19348.0] + - - [33708, 4012, 1, 512, 33708, 33708, 512, 512] + - [13, 19378.0] + - - [33708, 4020, 1, 512, 33708, 33708, 512, 512] + - [13, 19413.0] + - - [33708, 4026, 1, 512, 33708, 33708, 512, 512] + - [13, 19440.0] + - - [33708, 4030, 1, 512, 33708, 33708, 512, 512] + - [13, 19467.0] + - - [33708, 4032, 1, 512, 33708, 33708, 512, 512] + - [13, 19467.0] + - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] + - [13, 15913.0] + - - [511, 8192, 1, 8192, 511, 511, 8192, 8192] + - [51, 16195.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 19236.0] + - - [8192, 8193, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 17970.0] + - - [3072, 3072, 1, 3071, 3072, 3072, 3071, 3071] + - [24, 20235.0] + - - [8192, 8192, 1, 8193, 8192, 8192, 8193, 8193] + - [7, 20433.0] + - - [7681, 8192, 1, 8192, 7681, 7681, 8192, 8192] + - [51, 19416.0] + - - [7680, 8192, 1, 8193, 7680, 7680, 8193, 8193] + - [24, 20232.0] + - - [513, 4096, 1, 4096, 513, 513, 4096, 4096] + - [13, 13124.0] + - - [3073, 512, 1, 3072, 3073, 3073, 3072, 3072] + - [13, 15808.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [51, 19763.0] + - - [4096, 4096, 1, 4097, 4096, 4096, 4097, 4097] + - [41, 19445.0] + - - [8192, 8191, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 18376.0] + - - [8192, 512, 1, 8193, 8192, 8192, 8193, 8193] + - [43, 17190.0] + - - [2880, 3071, 1, 3072, 2880, 2880, 3072, 3072] + - [51, 18849.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [15, 18882.0] + - - [4096, 511, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 17148.0] + - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] + - [13, 16033.0] + - - [512, 8191, 1, 8192, 512, 512, 8192, 8192] + - [32, 15950.0] + - - [4096, 4095, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 19332.0] + - - [8192, 511, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 15146.0] + - - [8192, 512, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 15828.0] + - - [511, 3072, 1, 3072, 511, 511, 3072, 3072] + - [49, 15755.0] + - - [7680, 8193, 1, 8192, 7680, 7680, 8192, 8192] + - [51, 19158.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 17589.0] + - - [3072, 512, 1, 3073, 3072, 3072, 3073, 3073] + - [37, 16345.0] + - - [513, 8192, 1, 8192, 513, 513, 8192, 8192] + - [32, 13936.0] + - - [7679, 8192, 1, 8192, 7679, 7679, 8192, 8192] + - [51, 19834.0] + - - [3840, 4096, 1, 4097, 3840, 3840, 4097, 4097] + - [22, 19558.0] + - - [512, 3072, 1, 3071, 512, 512, 3071, 3071] + - [37, 16543.0] + - - [7680, 8192, 1, 8191, 7680, 7680, 8191, 8191] + - [24, 20235.0] + - - [3072, 511, 1, 3072, 3072, 3072, 3072, 3072] + - [30, 15712.0] + - - [8193, 8192, 1, 8192, 8193, 8193, 8192, 8192] + - [51, 18896.0] + - - [512, 4096, 1, 4095, 512, 512, 4095, 4095] + - [43, 17541.0] + - - [512, 3071, 1, 3072, 512, 512, 3072, 3072] + - [49, 15843.0] + - - [3073, 3072, 1, 3072, 3073, 3073, 3072, 3072] + - [13, 18454.0] + - - [512, 3073, 1, 3072, 512, 512, 3072, 3072] + - [49, 15818.0] + - - [4096, 4096, 1, 4095, 4096, 4096, 4095, 4095] + - [22, 19485.0] + - - [1920, 2048, 1, 2047, 1920, 1920, 2047, 2047] + - [37, 18722.0] + - - [1920, 2049, 1, 2048, 1920, 1920, 2048, 2048] + - [13, 15963.0] + - - [512, 8192, 1, 8191, 512, 512, 8191, 8191] + - [7, 17978.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [15, 19258.0] + - - [8191, 512, 1, 8192, 8191, 8191, 8192, 8192] + - [32, 15153.0] + - - [2881, 3072, 1, 3072, 2881, 2881, 3072, 3072] + - [32, 18878.0] + - - [512, 4096, 1, 4096, 512, 512, 4096, 4096] + - [15, 17494.0] + - - [3841, 4096, 1, 4096, 3841, 3841, 4096, 4096] + - [15, 19014.0] + - - [2880, 3072, 1, 3073, 2880, 2880, 3073, 3073] + - [7, 18877.0] + - - [4095, 512, 1, 4096, 4095, 4095, 4096, 4096] + - [15, 17330.0] + - - [1919, 2048, 1, 2048, 1919, 1919, 2048, 2048] + - [13, 18019.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [13, 18102.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 18306.0] + - - [511, 4096, 1, 4096, 511, 511, 4096, 4096] + - [51, 17345.0] + - - [8192, 513, 1, 8192, 8192, 8192, 8192, 8192] + - [44, 12469.0] + - - [513, 3072, 1, 3072, 513, 513, 3072, 3072] + - [51, 13042.0] + - - [7680, 8191, 1, 8192, 7680, 7680, 8192, 8192] + - [51, 19791.0] + - - [512, 4097, 1, 4096, 512, 512, 4096, 4096] + - [32, 17282.0] + - - [2047, 2048, 1, 2048, 2047, 2047, 2048, 2048] + - [15, 17623.0] + - - [2049, 2048, 1, 2048, 2049, 2049, 2048, 2048] + - [13, 16951.0] + - - [3840, 4095, 1, 4096, 3840, 3840, 4096, 4096] + - [15, 18982.0] + - - [2880, 3072, 1, 3071, 2880, 2880, 3071, 3071] + - [24, 18900.0] + - - [3072, 3072, 1, 3073, 3072, 3072, 3073, 3073] + - [7, 20235.0] + - - [2880, 3073, 1, 3072, 2880, 2880, 3072, 3072] + - [13, 18309.0] + - - [4096, 513, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 13372.0] + - - [4097, 512, 1, 4096, 4097, 4097, 4096, 4096] + - [15, 17422.0] + - - [8192, 512, 1, 8191, 8192, 8192, 8191, 8191] + - [24, 17192.0] + - - [1921, 2048, 1, 2048, 1921, 1921, 2048, 2048] + - [15, 16486.0] + - - [512, 3072, 1, 3073, 512, 512, 3073, 3073] + - [37, 16591.0] + - - [2048, 2049, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 17573.0] + - - [3072, 512, 1, 3071, 3072, 3072, 3071, 3071] + - [18, 16599.0] + - - [3071, 3072, 1, 3072, 3071, 3071, 3072, 3072] + - [32, 20192.0] + - - [3840, 4097, 1, 4096, 3840, 3840, 4096, 4096] + - [15, 18085.0] + - - [2048, 2047, 1, 2048, 2048, 2048, 2048, 2048] + - [32, 17425.0] + - - [2879, 3072, 1, 3072, 2879, 2879, 3072, 3072] + - [32, 18891.0] + - - [3072, 513, 1, 3072, 3072, 3072, 3072, 3072] + - [51, 13121.0] + - - [512, 4095, 1, 4096, 512, 512, 4096, 4096] + - [51, 17364.0] + - - [3071, 512, 1, 3072, 3071, 3071, 3072, 3072] + - [13, 15867.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 17538.0] + - - [4097, 4096, 1, 4096, 4097, 4097, 4096, 4096] + - [15, 18729.0] + - - [2048, 2048, 1, 2047, 2048, 2048, 2047, 2047] + - [24, 17700.0] + - - [3839, 4096, 1, 4096, 3839, 3839, 4096, 4096] + - [15, 19090.0] + - - [512, 4096, 1, 4097, 512, 512, 4097, 4097] + - [43, 17528.0] + - - [3072, 3073, 1, 3072, 3072, 3072, 3072, 3072] + - [13, 18441.0] + - - [2048, 2048, 1, 2049, 2048, 2048, 2049, 2049] + - [24, 17763.0] + - - [8191, 8192, 1, 8192, 8191, 8191, 8192, 8192] + - [51, 18268.0] + - - [3072, 3071, 1, 3072, 3072, 3072, 3072, 3072] + - [32, 20121.0] + - - [4096, 512, 1, 4097, 4096, 4096, 4097, 4097] + - [43, 17502.0] + - - [3840, 4096, 1, 4095, 3840, 3840, 4095, 4095] + - [5, 19560.0] + - - [1920, 2047, 1, 2048, 1920, 1920, 2048, 2048] + - [13, 17827.0] + - - [8192, 8192, 1, 8191, 8192, 8192, 8191, 8191] + - [7, 20436.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [32, 20204.0] + - - [512, 8193, 1, 8192, 512, 512, 8192, 8192] + - [32, 16213.0] + - - [4096, 512, 1, 4095, 4096, 4096, 4095, 4095] + - [43, 17491.0] + - - [8193, 512, 1, 8192, 8193, 8193, 8192, 8192] + - [32, 15218.0] + - - [4095, 4096, 1, 4096, 4095, 4095, 4096, 4096] + - [13, 19253.0] + - - [4096, 4097, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 18726.0] + - - [512, 8192, 1, 8192, 512, 512, 8192, 8192] + - [32, 16515.0] + - - [512, 8192, 1, 8193, 512, 512, 8193, 8193] + - [24, 17982.0] + - - [1920, 2048, 1, 2049, 1920, 1920, 2049, 2049] + - [37, 18641.0] + - - [479, 3072, 1, 3072, 479, 479, 3072, 3072] + - [13, 14920.0] + - - [479, 4096, 1, 4096, 479, 479, 4096, 4096] + - [32, 16363.0] + - - [479, 8192, 1, 8192, 479, 479, 8192, 8192] + - [32, 15191.0] + - - [480, 3072, 1, 3071, 480, 480, 3071, 3071] + - [37, 15545.0] + - - [480, 3072, 1, 3073, 480, 480, 3073, 3073] + - [1, 15544.0] + - - [480, 3073, 1, 3072, 480, 480, 3072, 3072] + - [13, 14954.0] + - - [480, 4095, 1, 4096, 480, 480, 4096, 4096] + - [32, 16366.0] + - - [480, 4096, 1, 4095, 480, 480, 4095, 4095] + - [24, 16504.0] + - - [480, 4096, 1, 4097, 480, 480, 4097, 4097] + - [43, 16494.0] + - - [480, 4097, 1, 4096, 480, 480, 4096, 4096] + - [32, 16394.0] + - - [480, 8191, 1, 8192, 480, 480, 8192, 8192] + - [32, 15180.0] + - - [480, 8192, 1, 8191, 480, 480, 8191, 8191] + - [43, 16857.0] + - - [480, 8192, 1, 8193, 480, 480, 8193, 8193] + - [7, 16855.0] + - - [480, 8193, 1, 8192, 480, 480, 8192, 8192] + - [51, 15252.0] + - - [481, 3072, 1, 3072, 481, 481, 3072, 3072] + - [13, 14852.0] + - - [481, 4096, 1, 4096, 481, 481, 4096, 4096] + - [51, 16423.0] + - - [481, 8192, 1, 8192, 481, 481, 8192, 8192] + - [51, 15672.0] + - - [3072, 479, 1, 3072, 3072, 3072, 3072, 3072] + - [30, 14801.0] + - - [3072, 480, 1, 3071, 3072, 3072, 3071, 3071] + - [37, 15532.0] + - - [3072, 480, 1, 3073, 3072, 3072, 3073, 3073] + - [18, 15452.0] + - - [3072, 481, 1, 3072, 3072, 3072, 3072, 3072] + - [30, 14937.0] + - - [3073, 480, 1, 3072, 3073, 3073, 3072, 3072] + - [49, 14901.0] + - - [480, 3072, 1, 3072, 480, 480, 3072, 3072] + - [30, 14909.0] + - - [480, 4096, 1, 4096, 480, 480, 4096, 4096] + - [51, 16423.0] + - - [480, 8192, 1, 8192, 480, 480, 8192, 8192] + - [32, 15450.0] + - - [3072, 480, 1, 3072, 3072, 3072, 3072, 3072] + - [49, 14819.0] + - - [4096, 480, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 16178.0] + - - [8192, 480, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 14474.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 17697.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16777.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18420.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18678.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18358.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 18284.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [49, 18204.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17449.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18719.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18345.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19309.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19310.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19285.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19306.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19111.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19301.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19116.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19242.0] + - - [42720, 3968, 1, 1024, 42720, 42720, 1024, 1024] + - [49, 19908.0] + - - [42720, 7200, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 19663.0] + - - [42720, 9520, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 19855.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [51, 15966.0] + - - [2048, 960, 1, 74, 2048, 2048, 74, 74] + - [0, 12852.0] + - - [1600, 1024, 1, 960, 1600, 1600, 960, 960] + - [18, 16658.0] + - - [2048, 2048, 1, 960, 2048, 2048, 960, 960] + - [24, 17583.0] + - - [4096, 1024, 1, 257, 4096, 4096, 257, 257] + - [18, 16283.0] + - - [10240, 8976, 1, 256, 10240, 10240, 256, 256] + - [13, 19197.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [49, 15398.0] + - - [1024, 1600, 1, 560, 1024, 1024, 560, 560] + - [38, 16148.0] + - - [10496, 8976, 1, 256, 10496, 10496, 256, 256] + - [13, 19307.0] + - - [11264, 8976, 1, 256, 11264, 11264, 256, 256] + - [13, 19194.0] + - - [11776, 8976, 1, 256, 11776, 11776, 256, 256] + - [13, 19338.0] + - - [12544, 8976, 1, 256, 12544, 12544, 256, 256] + - [13, 19346.0] + - - [1280, 8976, 1, 256, 1280, 1280, 256, 256] + - [49, 17878.0] + - - [13312, 8976, 1, 256, 13312, 13312, 256, 256] + - [13, 19333.0] + - - [13568, 8976, 1, 256, 13568, 13568, 256, 256] + - [13, 19391.0] + - - [13824, 8976, 1, 256, 13824, 13824, 256, 256] + - [13, 19348.0] + - - [15104, 8976, 1, 256, 15104, 15104, 256, 256] + - [13, 19398.0] + - - [15360, 8976, 1, 256, 15360, 15360, 256, 256] + - [13, 19335.0] + - - [15872, 8976, 1, 256, 15872, 15872, 256, 256] + - [13, 19383.0] + - - [16128, 8976, 1, 256, 16128, 16128, 256, 256] + - [13, 19427.0] + - - [17152, 8976, 1, 256, 17152, 17152, 256, 256] + - [13, 19446.0] + - - [1792, 8976, 1, 256, 1792, 1792, 256, 256] + - [49, 18264.0] + - - [18176, 8976, 1, 256, 18176, 18176, 256, 256] + - [13, 19447.0] + - - [18688, 8976, 1, 256, 18688, 18688, 256, 256] + - [13, 19458.0] + - - [18944, 8976, 1, 256, 18944, 18944, 256, 256] + - [13, 19442.0] + - - [19712, 8976, 1, 256, 19712, 19712, 256, 256] + - [13, 19441.0] + - - [19968, 8976, 1, 256, 19968, 19968, 256, 256] + - [13, 19436.0] + - - [20480, 8976, 1, 256, 20480, 20480, 256, 256] + - [13, 19391.0] + - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] + - [15, 16113.0] + - - [2048, 1536, 1, 768, 2048, 2048, 768, 768] + - [32, 17047.0] + - - [2048, 684, 1, 512, 2048, 2048, 512, 512] + - [29, 13788.0] + - - [2048, 684, 1, 768, 2048, 2048, 768, 768] + - [29, 14111.0] + - - [2048, 8976, 1, 256, 2048, 2048, 256, 256] + - [30, 18638.0] + - - [20992, 8976, 1, 256, 20992, 20992, 256, 256] + - [13, 19446.0] + - - [21248, 8976, 1, 256, 21248, 21248, 256, 256] + - [13, 19445.0] + - - [2304, 8976, 1, 256, 2304, 2304, 256, 256] + - [24, 18553.0] + - - [23552, 8976, 1, 256, 23552, 23552, 256, 256] + - [13, 19418.0] + - - [2560, 8976, 1, 256, 2560, 2560, 256, 256] + - [30, 18710.0] + - - [256, 10496, 1, 1024, 256, 256, 1024, 1024] + - [49, 16639.0] + - - [256, 11264, 1, 1024, 256, 256, 1024, 1024] + - [49, 18055.0] + - - [256, 11520, 1, 1024, 256, 256, 1024, 1024] + - [14, 17106.0] + - - [256, 11776, 1, 1024, 256, 256, 1024, 1024] + - [51, 16581.0] + - - [256, 12544, 1, 1024, 256, 256, 1024, 1024] + - [51, 17596.0] + - - [256, 13312, 1, 1024, 256, 256, 1024, 1024] + - [32, 18560.0] + - - [256, 14336, 1, 1024, 256, 256, 1024, 1024] + - [30, 16845.0] + - - [256, 14592, 1, 1024, 256, 256, 1024, 1024] + - [30, 17160.0] + - - [256, 14848, 1, 1024, 256, 256, 1024, 1024] + - [30, 17442.0] + - - [256, 15104, 1, 1024, 256, 256, 1024, 1024] + - [49, 17735.0] + - - [256, 16128, 1, 1024, 256, 256, 1024, 1024] + - [14, 17238.0] + - - [256, 18176, 1, 1024, 256, 256, 1024, 1024] + - [32, 19200.0] + - - [256, 18944, 1, 1024, 256, 256, 1024, 1024] + - [49, 17522.0] + - - [256, 19200, 1, 1024, 256, 256, 1024, 1024] + - [49, 17754.0] + - - [256, 20480, 1, 1024, 256, 256, 1024, 1024] + - [49, 18897.0] + - - [256, 20992, 1, 1024, 256, 256, 1024, 1024] + - [51, 17937.0] + - - [256, 21248, 1, 1024, 256, 256, 1024, 1024] + - [51, 18174.0] + - - [256, 21504, 1, 1024, 256, 256, 1024, 1024] + - [32, 18373.0] + - - [256, 22016, 1, 1024, 256, 256, 1024, 1024] + - [51, 18776.0] + - - [256, 22344, 1, 1024, 256, 256, 1024, 1024] + - [32, 18655.0] + - - [256, 23296, 1, 1024, 256, 256, 1024, 1024] + - [30, 17763.0] + - - [256, 23552, 1, 1024, 256, 256, 1024, 1024] + - [30, 17930.0] + - - [256, 31488, 1, 1024, 256, 256, 1024, 1024] + - [32, 18861.0] + - - [256, 33536, 1, 1024, 256, 256, 1024, 1024] + - [49, 18641.0] + - - [256, 44505, 1, 1024, 256, 256, 1024, 1024] + - [51, 18497.0] + - - [256, 4608, 1, 1024, 256, 256, 1024, 1024] + - [32, 16033.0] + - - [256, 4864, 1, 1024, 256, 256, 1024, 1024] + - [45, 13265.0] + - - [256, 5376, 1, 1024, 256, 256, 1024, 1024] + - [48, 14207.0] + - - [256, 5888, 1, 1024, 256, 256, 1024, 1024] + - [49, 15059.0] + - - [256, 6144, 1, 1024, 256, 256, 1024, 1024] + - [13, 15689.0] + - - [256, 6400, 1, 1024, 256, 256, 1024, 1024] + - [30, 16292.0] + - - [256, 6656, 1, 1024, 256, 256, 1024, 1024] + - [13, 16914.0] + - - [256, 7168, 1, 1024, 256, 256, 1024, 1024] + - [51, 15064.0] + - - [256, 7424, 1, 1024, 256, 256, 1024, 1024] + - [32, 15594.0] + - - [256, 7936, 1, 1024, 256, 256, 1024, 1024] + - [32, 16569.0] + - - [256, 8192, 1, 1024, 256, 256, 1024, 1024] + - [32, 16998.0] + - - [256, 8448, 1, 1024, 256, 256, 1024, 1024] + - [32, 17585.0] + - - [256, 8960, 1, 1024, 256, 256, 1024, 1024] + - [51, 18509.0] + - - [256, 9984, 1, 1024, 256, 256, 1024, 1024] + - [49, 16110.0] + - - [2816, 8976, 1, 256, 2816, 2816, 256, 256] + - [49, 18717.0] + - - [28672, 8976, 1, 256, 28672, 28672, 256, 256] + - [13, 19453.0] + - - [3072, 8976, 1, 256, 3072, 3072, 256, 256] + - [30, 18806.0] + - - [31488, 8976, 1, 256, 31488, 31488, 256, 256] + - [13, 19510.0] + - - [3328, 8976, 1, 256, 3328, 3328, 256, 256] + - [49, 18845.0] + - - [33536, 8976, 1, 256, 33536, 33536, 256, 256] + - [13, 19525.0] + - - [3840, 8976, 1, 256, 3840, 3840, 256, 256] + - [30, 18810.0] + - - [4096, 8976, 1, 256, 4096, 4096, 256, 256] + - [30, 18730.0] + - - [4352, 8976, 1, 256, 4352, 4352, 256, 256] + - [13, 18943.0] + - - [44505, 8976, 1, 256, 44505, 44505, 256, 256] + - [13, 19488.0] + - - [4608, 8976, 1, 256, 4608, 4608, 256, 256] + - [30, 18900.0] + - - [4864, 8976, 1, 256, 4864, 4864, 256, 256] + - [30, 18976.0] + - - [5120, 8976, 1, 256, 5120, 5120, 256, 256] + - [13, 18578.0] + - - [5376, 8976, 1, 256, 5376, 5376, 256, 256] + - [41, 19037.0] + - - [5632, 8976, 1, 256, 5632, 5632, 256, 256] + - [30, 19017.0] + - - [5888, 8976, 1, 256, 5888, 5888, 256, 256] + - [13, 19073.0] + - - [6144, 8976, 1, 256, 6144, 6144, 256, 256] + - [13, 18828.0] + - - [6400, 8976, 1, 256, 6400, 6400, 256, 256] + - [13, 19131.0] + - - [684, 8976, 1, 256, 684, 684, 256, 256] + - [49, 15415.0] + - - [7168, 8976, 1, 256, 7168, 7168, 256, 256] + - [13, 19001.0] + - - [7936, 8976, 1, 256, 7936, 7936, 256, 256] + - [13, 19183.0] + - - [8192, 8976, 1, 256, 8192, 8192, 256, 256] + - [5, 18744.0] + - - [8448, 8976, 1, 256, 8448, 8448, 256, 256] + - [13, 19270.0] + - - [8960, 8976, 1, 256, 8960, 8960, 256, 256] + - [13, 19279.0] + - - [9472, 8976, 1, 256, 9472, 9472, 256, 256] + - [13, 19281.0] + - - [9728, 8976, 1, 256, 9728, 9728, 256, 256] + - [13, 19268.0] + - - [9984, 8976, 1, 256, 9984, 9984, 256, 256] + - [13, 19291.0] + - - [512, 32768, 1, 13, 512, 512, 13, 13] + - [36, 6464.0] + - - [256, 32768, 1, 512, 256, 256, 512, 512] + - [49, 18171.0] + - - [128, 32768, 1, 512, 128, 128, 512, 512] + - [15, 17000.0] + - - [1024, 32768, 1, 479, 1024, 1024, 479, 479] + - [1, 19733.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 19511.0] + - - [512, 32768, 1, 1024, 512, 512, 1024, 1024] + - [49, 19213.0] + - - [1023, 2048, 1, 4096, 1023, 1023, 4096, 4096] + - [32, 17372.0] + - - [1025, 2048, 1, 4096, 1025, 1025, 4096, 4096] + - [32, 17483.0] + - - [1024, 2047, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17405.0] + - - [1024, 2049, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17450.0] + - - [1024, 2048, 1, 4095, 1024, 1024, 4095, 4095] + - [24, 17620.0] + - - [1024, 2048, 1, 4097, 1024, 1024, 4097, 4097] + - [24, 17606.0] + - - [1023, 3072, 1, 1024, 1023, 1023, 1024, 1024] + - [32, 16946.0] + - - [1025, 3072, 1, 1024, 1025, 1025, 1024, 1024] + - [32, 17087.0] + - - [1024, 3071, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 16875.0] + - - [1024, 3073, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 16949.0] + - - [1024, 3072, 1, 1023, 1024, 1024, 1023, 1023] + - [1, 17259.0] + - - [1024, 3072, 1, 1025, 1024, 1024, 1025, 1025] + - [24, 17280.0] + - - [3071, 512, 1, 1024, 3071, 3071, 1024, 1024] + - [13, 15124.0] + - - [3073, 512, 1, 1024, 3073, 3073, 1024, 1024] + - [49, 15111.0] + - - [3072, 511, 1, 1024, 3072, 3072, 1024, 1024] + - [49, 15148.0] + - - [3072, 513, 1, 1024, 3072, 3072, 1024, 1024] + - [48, 13602.0] + - - [3072, 512, 1, 1023, 3072, 3072, 1023, 1023] + - [37, 16230.0] + - - [3072, 512, 1, 1025, 3072, 3072, 1025, 1025] + - [18, 16054.0] + - - [128, 32768, 1, 256, 128, 128, 256, 256] + - [30, 16220.0] + - - [1024, 4096, 1, 480, 1024, 1024, 480, 480] + - [1, 17356.0] + - - [512, 4096, 1, 1024, 512, 512, 1024, 1024] + - [32, 17109.0] + - - [512, 55296, 1, 13, 512, 512, 13, 13] + - [39, 6783.0] + - - [256, 55296, 1, 512, 256, 256, 512, 512] + - [49, 19138.0] + - - [128, 55296, 1, 256, 128, 128, 256, 256] + - [30, 18148.0] + - - [1024, 6912, 1, 480, 1024, 1024, 480, 480] + - [1, 19645.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19599.0] + - - [512, 6912, 1, 1024, 512, 512, 1024, 1024] + - [15, 19170.0] + - - [256, 6912, 1, 512, 256, 256, 512, 512] + - [10, 16472.0] + - - [1151, 1152, 1, 1152, 1151, 1151, 1152, 1152] + - [48, 13460.0] + - - [1153, 1152, 1, 1152, 1153, 1153, 1152, 1152] + - [46, 13413.0] + - - [1152, 1151, 1, 1152, 1152, 1152, 1152, 1152] + - [40, 13561.0] + - - [1152, 1153, 1, 1152, 1152, 1152, 1152, 1152] + - [21, 13628.0] + - - [1152, 1152, 1, 1151, 1152, 1152, 1151, 1151] + - [17, 15190.0] + - - [1152, 1152, 1, 1153, 1152, 1152, 1153, 1153] + - [17, 15250.0] + - - [1535, 1536, 1, 1536, 1535, 1535, 1536, 1536] + - [15, 18899.0] + - - [1537, 1536, 1, 1536, 1537, 1537, 1536, 1536] + - [49, 14793.0] + - - [1536, 1535, 1, 1536, 1536, 1536, 1536, 1536] + - [51, 18666.0] + - - [1536, 1537, 1, 1536, 1536, 1536, 1536, 1536] + - [49, 14829.0] + - - [1536, 1536, 1, 1535, 1536, 1536, 1535, 1535] + - [24, 19265.0] + - - [1536, 1536, 1, 1537, 1536, 1536, 1537, 1537] + - [7, 19223.0] + - - [1919, 1920, 1, 1920, 1919, 1919, 1920, 1920] + - [5, 17006.0] + - - [1921, 1920, 1, 1920, 1921, 1921, 1920, 1920] + - [49, 16998.0] + - - [1920, 1919, 1, 1920, 1920, 1920, 1920, 1920] + - [41, 17012.0] + - - [1920, 1921, 1, 1920, 1920, 1920, 1920, 1920] + - [41, 17003.0] + - - [1920, 1920, 1, 1919, 1920, 1920, 1919, 1919] + - [37, 17676.0] + - - [1920, 1920, 1, 1921, 1920, 1920, 1921, 1921] + - [18, 17619.0] + - - [2303, 2304, 1, 2304, 2303, 2303, 2304, 2304] + - [32, 17986.0] + - - [2305, 2304, 1, 2304, 2305, 2305, 2304, 2304] + - [51, 18033.0] + - - [2304, 2303, 1, 2304, 2304, 2304, 2304, 2304] + - [32, 17914.0] + - - [2304, 2305, 1, 2304, 2304, 2304, 2304, 2304] + - [51, 18020.0] + - - [2304, 2304, 1, 2303, 2304, 2304, 2303, 2303] + - [19, 18610.0] + - - [2304, 2304, 1, 2305, 2304, 2304, 2305, 2305] + - [38, 18580.0] + - - [2687, 2688, 1, 2688, 2687, 2687, 2688, 2688] + - [22, 18421.0] + - - [2689, 2688, 1, 2688, 2689, 2689, 2688, 2688] + - [41, 18419.0] + - - [2688, 2687, 1, 2688, 2688, 2688, 2688, 2688] + - [41, 18429.0] + - - [2688, 2689, 1, 2688, 2688, 2688, 2688, 2688] + - [41, 18420.0] + - - [2688, 2688, 1, 2687, 2688, 2688, 2687, 2687] + - [37, 19061.0] + - - [2688, 2688, 1, 2689, 2688, 2688, 2689, 2689] + - [37, 19011.0] + - - [3455, 3456, 1, 3456, 3455, 3455, 3456, 3456] + - [41, 19093.0] + - - [3457, 3456, 1, 3456, 3457, 3457, 3456, 3456] + - [24, 18773.0] + - - [3456, 3455, 1, 3456, 3456, 3456, 3456, 3456] + - [22, 19110.0] + - - [3456, 3457, 1, 3456, 3456, 3456, 3456, 3456] + - [24, 18644.0] + - - [3456, 3456, 1, 3455, 3456, 3456, 3455, 3455] + - [37, 19720.0] + - - [3456, 3456, 1, 3457, 3456, 3456, 3457, 3457] + - [18, 19695.0] + - - [3839, 3840, 1, 3840, 3839, 3839, 3840, 3840] + - [15, 19334.0] + - - [3841, 3840, 1, 3840, 3841, 3841, 3840, 3840] + - [7, 19339.0] + - - [3840, 3839, 1, 3840, 3840, 3840, 3840, 3840] + - [32, 19339.0] + - - [3840, 3841, 1, 3840, 3840, 3840, 3840, 3840] + - [41, 19009.0] + - - [3840, 3840, 1, 3839, 3840, 3840, 3839, 3839] + - [3, 19375.0] + - - [3840, 3840, 1, 3841, 3840, 3840, 3841, 3841] + - [24, 19341.0] + - - [4223, 4224, 1, 4224, 4223, 4223, 4224, 4224] + - [22, 19421.0] + - - [4225, 4224, 1, 4224, 4225, 4225, 4224, 4224] + - [22, 18844.0] + - - [4224, 4223, 1, 4224, 4224, 4224, 4224, 4224] + - [5, 19427.0] + - - [4224, 4225, 1, 4224, 4224, 4224, 4224, 4224] + - [24, 19380.0] + - - [4224, 4224, 1, 4223, 4224, 4224, 4223, 4223] + - [18, 20036.0] + - - [4224, 4224, 1, 4225, 4224, 4224, 4225, 4225] + - [18, 20023.0] + - - [4607, 4608, 1, 4608, 4607, 4607, 4608, 4608] + - [32, 20381.0] + - - [4609, 4608, 1, 4608, 4609, 4609, 4608, 4608] + - [32, 19385.0] + - - [4608, 4607, 1, 4608, 4608, 4608, 4608, 4608] + - [51, 20377.0] + - - [4608, 4609, 1, 4608, 4608, 4608, 4608, 4608] + - [32, 19353.0] + - - [4608, 4608, 1, 4607, 4608, 4608, 4607, 4607] + - [24, 20465.0] + - - [4608, 4608, 1, 4609, 4608, 4608, 4609, 4609] + - [24, 20462.0] + - - [4991, 4992, 1, 4992, 4991, 4991, 4992, 4992] + - [24, 19685.0] + - - [4993, 4992, 1, 4992, 4993, 4993, 4992, 4992] + - [22, 19169.0] + - - [4992, 4991, 1, 4992, 4992, 4992, 4992, 4992] + - [24, 19689.0] + - - [4992, 4993, 1, 4992, 4992, 4992, 4992, 4992] + - [7, 19706.0] + - - [4992, 4992, 1, 4991, 4992, 4992, 4991, 4991] + - [18, 20218.0] + - - [4992, 4992, 1, 4993, 4992, 4992, 4993, 4993] + - [18, 20191.0] + - - [5375, 5376, 1, 5376, 5375, 5375, 5376, 5376] + - [7, 20109.0] + - - [5377, 5376, 1, 5376, 5377, 5377, 5376, 5376] + - [24, 19383.0] + - - [5376, 5375, 1, 5376, 5376, 5376, 5376, 5376] + - [7, 20084.0] + - - [5376, 5377, 1, 5376, 5376, 5376, 5376, 5376] + - [51, 19273.0] + - - [5376, 5376, 1, 5375, 5376, 5376, 5375, 5375] + - [24, 20082.0] + - - [5376, 5376, 1, 5377, 5376, 5376, 5377, 5377] + - [24, 20080.0] + - - [5759, 5760, 1, 5760, 5759, 5759, 5760, 5760] + - [24, 19861.0] + - - [5761, 5760, 1, 5760, 5761, 5761, 5760, 5760] + - [30, 19382.0] + - - [5760, 5759, 1, 5760, 5760, 5760, 5760, 5760] + - [43, 19851.0] + - - [5760, 5761, 1, 5760, 5760, 5760, 5760, 5760] + - [43, 19856.0] + - - [5760, 5760, 1, 5759, 5760, 5760, 5759, 5759] + - [37, 20011.0] + - - [5760, 5760, 1, 5761, 5760, 5760, 5761, 5761] + - [24, 19818.0] + - - [6143, 6144, 1, 6144, 6143, 6143, 6144, 6144] + - [51, 19711.0] + - - [6145, 6144, 1, 6144, 6145, 6145, 6144, 6144] + - [51, 19477.0] + - - [6144, 6143, 1, 6144, 6144, 6144, 6144, 6144] + - [51, 19641.0] + - - [6144, 6145, 1, 6144, 6144, 6144, 6144, 6144] + - [51, 18858.0] + - - [6144, 6144, 1, 6143, 6144, 6144, 6143, 6143] + - [24, 20409.0] + - - [6144, 6144, 1, 6145, 6144, 6144, 6145, 6145] + - [7, 20406.0] + - - [6527, 6528, 1, 6528, 6527, 6527, 6528, 6528] + - [24, 19843.0] + - - [6529, 6528, 1, 6528, 6529, 6529, 6528, 6528] + - [22, 19532.0] + - - [6528, 6527, 1, 6528, 6528, 6528, 6528, 6528] + - [43, 19830.0] + - - [6528, 6529, 1, 6528, 6528, 6528, 6528, 6528] + - [24, 19850.0] + - - [6528, 6528, 1, 6527, 6528, 6528, 6527, 6527] + - [24, 19777.0] + - - [6528, 6528, 1, 6529, 6528, 6528, 6529, 6529] + - [7, 19775.0] + - - [6911, 6912, 1, 6912, 6911, 6911, 6912, 6912] + - [32, 20280.0] + - - [6913, 6912, 1, 6912, 6913, 6913, 6912, 6912] + - [24, 19838.0] + - - [6912, 6911, 1, 6912, 6912, 6912, 6912, 6912] + - [32, 20296.0] + - - [6912, 6913, 1, 6912, 6912, 6912, 6912, 6912] + - [30, 19532.0] + - - [6912, 6912, 1, 6911, 6912, 6912, 6911, 6911] + - [24, 20291.0] + - - [6912, 6912, 1, 6913, 6912, 6912, 6913, 6913] + - [7, 20294.0] + - - [7295, 7296, 1, 7296, 7295, 7295, 7296, 7296] + - [30, 19853.0] + - - [7297, 7296, 1, 7296, 7297, 7297, 7296, 7296] + - [24, 19692.0] + - - [7296, 7295, 1, 7296, 7296, 7296, 7296, 7296] + - [41, 19864.0] + - - [7296, 7297, 1, 7296, 7296, 7296, 7296, 7296] + - [43, 19870.0] + - - [7296, 7296, 1, 7295, 7296, 7296, 7295, 7295] + - [37, 19984.0] + - - [7296, 7296, 1, 7297, 7296, 7296, 7297, 7297] + - [5, 19835.0] + - - [7679, 7680, 1, 7680, 7679, 7679, 7680, 7680] + - [51, 20387.0] + - - [7681, 7680, 1, 7680, 7681, 7681, 7680, 7680] + - [51, 19824.0] + - - [7680, 7679, 1, 7680, 7680, 7680, 7680, 7680] + - [51, 20357.0] + - - [7680, 7681, 1, 7680, 7680, 7680, 7680, 7680] + - [51, 19607.0] + - - [7680, 7680, 1, 7679, 7680, 7680, 7679, 7679] + - [7, 20457.0] + - - [7680, 7680, 1, 7681, 7680, 7680, 7681, 7681] + - [24, 20462.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [40, 13541.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [32, 19170.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [18, 17140.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [51, 18098.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [22, 18425.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [41, 19097.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [51, 19335.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [41, 19440.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [32, 20425.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [24, 19687.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [7, 20085.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [7, 19854.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [51, 19638.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [43, 19853.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [51, 20296.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [13, 19863.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [51, 20342.0] + - - [256, 128, 49, 1152, 256, 256, 1152, 1152] + - [13, 15675.0] + - - [256, 128, 121, 120, 256, 256, 120, 120] + - [2, 15133.0] + - - [256, 128, 169, 120, 256, 256, 120, 120] + - [26, 16075.0] + - - [256, 128, 36, 120, 256, 256, 120, 120] + - [17, 12203.0] + - - [256, 128, 49, 120, 256, 256, 120, 120] + - [17, 12897.0] + - - [256, 128, 64, 120, 256, 256, 120, 120] + - [18, 13722.0] + - - [256, 128, 36, 12000, 256, 256, 12000, 12000] + - [19, 16275.0] + - - [256, 128, 49, 1216, 256, 256, 1216, 1216] + - [18, 16465.0] + - - [256, 128, 121, 18, 256, 256, 18, 18] + - [8, 5620.0] + - - [256, 128, 169, 18, 256, 256, 18, 18] + - [4, 6932.0] + - - [256, 128, 36, 18, 256, 256, 18, 18] + - [0, 4147.0] + - - [256, 128, 49, 18, 256, 256, 18, 18] + - [0, 4817.0] + - - [256, 128, 64, 18, 256, 256, 18, 18] + - [8, 5302.0] + - - [256, 128, 36, 1800, 256, 256, 1800, 1800] + - [47, 17560.0] + - - [256, 128, 121, 19, 256, 256, 19, 19] + - [4, 6461.0] + - - [256, 128, 169, 19, 256, 256, 19, 19] + - [4, 7197.0] + - - [256, 128, 36, 19, 256, 256, 19, 19] + - [8, 4277.0] + - - [256, 128, 49, 19, 256, 256, 19, 19] + - [8, 5034.0] + - - [256, 128, 64, 19, 256, 256, 19, 19] + - [0, 5519.0] + - - [256, 128, 36, 1900, 256, 256, 1900, 1900] + - [19, 17590.0] + - - [256, 128, 49, 480, 256, 256, 480, 480] + - [37, 15754.0] + - - [256, 128, 81, 480, 256, 256, 480, 480] + - [22, 16197.0] + - - [256, 128, 64, 5880, 256, 256, 5880, 5880] + - [23, 14384.0] + - - [256, 128, 49, 72, 256, 256, 72, 72] + - [36, 10231.0] + - - [256, 128, 81, 72, 256, 256, 72, 72] + - [38, 11811.0] + - - [256, 128, 49, 76, 256, 256, 76, 76] + - [0, 10538.0] + - - [256, 128, 81, 76, 256, 256, 76, 76] + - [36, 11059.0] + - - [256, 128, 49, 7680, 256, 256, 7680, 7680] + - [35, 12001.0] + - - [256, 128, 64, 882, 256, 256, 882, 882] + - [18, 16462.0] + - - [256, 128, 64, 931, 256, 256, 931, 931] + - [37, 16693.0] + - - [256, 256, 49, 1152, 256, 256, 1152, 1152] + - [15, 17389.0] + - - [256, 256, 36, 12000, 256, 256, 12000, 12000] + - [7, 19742.0] + - - [256, 256, 49, 1216, 256, 256, 1216, 1216] + - [7, 17515.0] + - - [256, 256, 36, 1800, 256, 256, 1800, 1800] + - [7, 19033.0] + - - [256, 256, 36, 1900, 256, 256, 1900, 1900] + - [43, 19072.0] + - - [256, 256, 64, 5880, 256, 256, 5880, 5880] + - [24, 17854.0] + - - [256, 256, 49, 7680, 256, 256, 7680, 7680] + - [34, 14373.0] + - - [256, 256, 64, 882, 256, 256, 882, 882] + - [1, 17135.0] + - - [256, 256, 64, 931, 256, 256, 931, 931] + - [1, 17128.0] + - - [340, 256, 49, 1152, 340, 340, 1152, 1152] + - [49, 15174.0] + - - [340, 256, 36, 120, 340, 340, 120, 120] + - [36, 12816.0] + - - [340, 256, 49, 120, 340, 340, 120, 120] + - [38, 13810.0] + - - [340, 256, 64, 120, 340, 340, 120, 120] + - [38, 14482.0] + - - [340, 256, 36, 12000, 340, 340, 12000, 12000] + - [24, 17698.0] + - - [340, 256, 49, 1216, 340, 340, 1216, 1216] + - [1, 15507.0] + - - [340, 256, 36, 18, 340, 340, 18, 18] + - [36, 4215.0] + - - [340, 256, 49, 18, 340, 340, 18, 18] + - [19, 5244.0] + - - [340, 256, 64, 18, 340, 340, 18, 18] + - [45, 5473.0] + - - [340, 256, 36, 1800, 340, 340, 1800, 1800] + - [7, 17136.0] + - - [340, 256, 36, 19, 340, 340, 19, 19] + - [25, 4423.0] + - - [340, 256, 49, 19, 340, 340, 19, 19] + - [25, 5381.0] + - - [340, 256, 64, 19, 340, 340, 19, 19] + - [25, 5734.0] + - - [340, 256, 36, 1900, 340, 340, 1900, 1900] + - [7, 17127.0] + - - [340, 256, 64, 5880, 340, 340, 5880, 5880] + - [18, 17184.0] + - - [340, 256, 49, 7680, 340, 340, 7680, 7680] + - [16, 13056.0] + - - [340, 256, 64, 882, 340, 340, 882, 882] + - [18, 16581.0] + - - [340, 256, 64, 931, 340, 340, 931, 931] + - [37, 16558.0] + - - [510, 256, 49, 120, 510, 510, 120, 120] + - [47, 14307.0] + - - [510, 256, 64, 120, 510, 510, 120, 120] + - [19, 15257.0] + - - [510, 256, 49, 18, 510, 510, 18, 18] + - [38, 4300.0] + - - [510, 256, 64, 18, 510, 510, 18, 18] + - [36, 4500.0] + - - [510, 256, 49, 19, 510, 510, 19, 19] + - [25, 4472.0] + - - [510, 256, 64, 19, 510, 510, 19, 19] + - [17, 4731.0] + - - [510, 256, 36, 480, 510, 510, 480, 480] + - [1, 18556.0] + - - [510, 256, 36, 72, 510, 510, 72, 72] + - [19, 11333.0] + - - [510, 256, 36, 76, 510, 510, 76, 76] + - [50, 11219.0] + - - [510, 512, 36, 1080, 510, 510, 1080, 1080] + - [37, 19654.0] + - - [510, 512, 36, 162, 510, 510, 162, 162] + - [38, 16228.0] + - - [510, 512, 36, 171, 510, 510, 171, 171] + - [13, 16551.0] + - - [510, 512, 49, 1920, 510, 510, 1920, 1920] + - [51, 19508.0] + - - [510, 512, 64, 1920, 510, 510, 1920, 1920] + - [22, 19238.0] + - - [510, 512, 49, 288, 510, 510, 288, 288] + - [22, 18379.0] + - - [510, 512, 64, 288, 510, 510, 288, 288] + - [1, 18741.0] + - - [510, 512, 36, 3000, 510, 510, 3000, 3000] + - [24, 19964.0] + - - [510, 512, 49, 304, 510, 510, 304, 304] + - [37, 18380.0] + - - [510, 512, 64, 304, 510, 510, 304, 304] + - [18, 18794.0] + - - [510, 512, 36, 450, 510, 510, 450, 450] + - [22, 18521.0] + - - [510, 512, 36, 475, 510, 510, 475, 475] + - [41, 18714.0] + - - [510, 512, 49, 480, 510, 510, 480, 480] + - [1, 19363.0] + - - [510, 512, 64, 480, 510, 510, 480, 480] + - [1, 19005.0] + - - [510, 512, 49, 72, 510, 510, 72, 72] + - [19, 14199.0] + - - [510, 512, 64, 72, 510, 510, 72, 72] + - [50, 14699.0] + - - [510, 512, 49, 76, 510, 510, 76, 76] + - [23, 14887.0] + - - [510, 512, 64, 76, 510, 510, 76, 76] + - [23, 15298.0] + - - [512, 256, 81, 1080, 512, 512, 1080, 1080] + - [37, 19625.0] + - - [512, 256, 25, 12000, 512, 512, 12000, 12000] + - [7, 18588.0] + - - [512, 256, 81, 162, 512, 512, 162, 162] + - [1, 17529.0] + - - [512, 256, 81, 171, 512, 512, 171, 171] + - [5, 18108.0] + - - [512, 256, 25, 1800, 512, 512, 1800, 1800] + - [7, 18044.0] + - - [512, 256, 25, 1900, 512, 512, 1900, 1900] + - [43, 18087.0] + - - [512, 256, 121, 1920, 512, 512, 1920, 1920] + - [5, 19364.0] + - - [512, 256, 169, 1920, 512, 512, 1920, 1920] + - [32, 19790.0] + - - [512, 256, 49, 1920, 512, 512, 1920, 1920] + - [41, 18615.0] + - - [512, 256, 121, 288, 512, 512, 288, 288] + - [22, 17608.0] + - - [512, 256, 169, 288, 512, 512, 288, 288] + - [5, 17602.0] + - - [512, 256, 49, 288, 512, 512, 288, 288] + - [1, 18684.0] + - - [512, 256, 25, 3000, 512, 512, 3000, 3000] + - [7, 18280.0] + - - [512, 256, 81, 3000, 512, 512, 3000, 3000] + - [7, 20191.0] + - - [512, 256, 121, 304, 512, 512, 304, 304] + - [22, 18175.0] + - - [512, 256, 169, 304, 512, 512, 304, 304] + - [5, 17655.0] + - - [512, 256, 49, 304, 512, 512, 304, 304] + - [1, 18702.0] + - - [512, 256, 25, 450, 512, 512, 450, 450] + - [1, 17202.0] + - - [512, 256, 81, 450, 512, 512, 450, 450] + - [5, 18769.0] + - - [512, 256, 25, 475, 512, 512, 475, 475] + - [37, 17048.0] + - - [512, 256, 81, 475, 512, 512, 475, 475] + - [1, 18694.0] + - - [512, 256, 121, 480, 512, 512, 480, 480] + - [41, 18941.0] + - - [512, 256, 169, 480, 512, 512, 480, 480] + - [41, 19126.0] + - - [512, 256, 49, 5880, 512, 512, 5880, 5880] + - [37, 19611.0] + - - [512, 256, 121, 72, 512, 512, 72, 72] + - [1, 16734.0] + - - [512, 256, 169, 72, 512, 512, 72, 72] + - [9, 17339.0] + - - [512, 256, 121, 76, 512, 512, 76, 76] + - [19, 16123.0] + - - [512, 256, 169, 76, 512, 512, 76, 76] + - [13, 16609.0] + - - [512, 256, 49, 882, 512, 512, 882, 882] + - [1, 18986.0] + - - [512, 256, 49, 931, 512, 512, 931, 931] + - [1, 19055.0] + - - [2304, 512, 1, 100, 2304, 2304, 100, 100] + - [38, 11387.0] + - - [2304, 512, 1, 361, 2304, 2304, 361, 361] + - [38, 16291.0] + - - [4608, 510, 1, 100, 4608, 4608, 100, 100] + - [0, 13013.0] + - - [4608, 510, 1, 361, 4608, 4608, 361, 361] + - [1, 16674.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [32, 18161.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 18714.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 18000.0] + - - [30522, 616, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 18312.0] + - - [128, 128, 128, 64, 128, 128, 64, 64] + - [19, 10585.0] + - - [128, 128, 160, 64, 128, 128, 64, 64] + - [6, 11170.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [29, 13497.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 13443.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18665.0] + - - [30522, 200, 1, 1024, 30522, 30522, 1024, 1024] + - [15, 14225.0] + - - [128, 128, 624, 64, 128, 128, 64, 64] + - [27, 13564.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18281.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17484.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19421.0] + - - [30522, 780, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 16907.0] + - - [30522, 308, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 15403.0] + - - [128, 128, 640, 64, 128, 128, 64, 64] + - [10, 14340.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18728.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17935.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19653.0] + - - [30522, 800, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 17321.0] + - - [128, 128, 656, 64, 128, 128, 64, 64] + - [4, 13594.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17769.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18297.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19409.0] + - - [30522, 820, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 17733.0] + - - [512, 512, 80, 64, 512, 512, 64, 64] + - [38, 17287.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 16430.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 16005.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19493.0] + - - [30522, 385, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 14425.0] + - - [30522, 462, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 17229.0] + - - [128, 128, 144, 64, 128, 128, 64, 64] + - [50, 10216.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 16232.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 17033.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 18700.0] + - - [30522, 180, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 14579.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18297.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17515.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 18974.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18264.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19459.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19262.0] + - - [33712, 8192, 1, 1024, 33712, 33712, 1024, 1024] + - [51, 20159.0] + - - [33712, 9600, 1, 1024, 33712, 33712, 1024, 1024] + - [51, 19866.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19104.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19115.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19131.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19039.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 18265.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18735.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18034.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17804.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 19149.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17565.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19658.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19624.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19157.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19166.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19349.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19348.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19435.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19339.0] + - - [42720, 10080, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 19970.0] + - - [42720, 6528, 1, 1024, 42720, 42720, 1024, 1024] + - [49, 19911.0] + - - [42720, 7104, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 19948.0] + - - [1024, 32768, 1, 480, 1024, 1024, 480, 480] + - [1, 20231.0] + - - [30592, 1024, 1, 2048, 30592, 30592, 2048, 2048] + - [49, 19501.0] + - - [6144, 1024, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 18696.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 18520.0] + - - [30592, 8192, 1, 1024, 30592, 30592, 1024, 1024] + - [51, 20172.0] + - - [3072, 8192, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19542.0] + - - [512, 512, 256, 64, 512, 512, 64, 64] + - [0, 9528.0] + - - [30592, 2048, 1, 1024, 30592, 30592, 1024, 1024] + - [51, 19830.0] + - - [30592, 4096, 1, 1024, 30592, 30592, 1024, 1024] + - [51, 20085.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 1024, 1024] + - [32, 19328.0] + - - [1920, 2048, 1, 2560, 1920, 1920, 2560, 2560] + - [13, 18172.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [30, 19030.0] + - - [2560, 2048, 1, 640, 2560, 2560, 640, 640] + - [41, 18775.0] + - - [7680, 2048, 1, 2560, 7680, 7680, 2560, 2560] + - [13, 19492.0] + - - [512, 512, 40, 64, 512, 512, 64, 64] + - [38, 16416.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 18745.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 6144, 6144] + - [13, 18351.0] + - - [4608, 4096, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 20255.0] + - - [50304, 4096, 1, 1536, 50304, 50304, 1536, 1536] + - [32, 20338.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 1536, 1536] + - [13, 19566.0] + - - [1024, 1024, 64, 96, 1024, 1024, 96, 96] + - [0, 14231.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [32, 19578.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 6144, 6144] + - [32, 19582.0] + - - [4608, 8192, 1, 1536, 4608, 4608, 1536, 1536] + - [32, 20204.0] + - - [50304, 8192, 1, 1536, 50304, 50304, 1536, 1536] + - [32, 20361.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 1536, 1536] + - [15, 19831.0] + - - [1024, 1024, 128, 96, 1024, 1024, 96, 96] + - [0, 13912.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 19250.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18892.0] + - - [3072, 16384, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19543.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19456.0] + - - [50304, 16384, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 20258.0] + - - [1024, 1024, 256, 64, 1024, 1024, 64, 64] + - [12, 10691.0] + - - [50304, 2048, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 20118.0] + - - [1024, 1024, 32, 64, 1024, 1024, 64, 64] + - [0, 16076.0] + - - [50304, 4096, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 20222.0] + - - [1024, 1024, 64, 64, 1024, 1024, 64, 64] + - [0, 13035.0] + - - [50304, 8192, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 20264.0] + - - [1024, 1024, 128, 64, 1024, 1024, 64, 64] + - [0, 11171.0] + - - [30528, 8192, 1, 1024, 30528, 30528, 1024, 1024] + - [51, 20141.0] + - - [128, 128, 1024, 64, 128, 128, 64, 64] + - [14, 15174.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18299.0] + - - [1024, 3456, 1, 480, 1024, 1024, 480, 480] + - [1, 18988.0] + - - [512, 3456, 1, 1024, 512, 512, 1024, 1024] + - [14, 16874.0] + - - [512, 3456, 1, 13, 512, 512, 13, 13] + - [31, 4213.0] + - - [512, 4096, 1, 13, 512, 512, 13, 13] + - [31, 4574.0] + - - [512, 6912, 1, 13, 512, 512, 13, 13] + - [36, 5181.0] + - - [30528, 640, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 19098.0] + - - [30528, 1280, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 19589.0] + - - [30528, 1600, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 18903.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19516.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19406.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19280.0] + - - [128, 128, 1280, 64, 128, 128, 64, 64] + - [0, 6991.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18349.0] + - - [30528, 1640, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 19400.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19328.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 18690.0] + - - [128, 128, 1312, 64, 128, 128, 64, 64] + - [0, 7555.0] + - - [30528, 160, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 13148.0] + - - [30528, 240, 1, 1024, 30528, 30528, 1024, 1024] + - [30, 17023.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 18493.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19502.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18010.0] + - - [512, 512, 192, 64, 512, 512, 64, 64] + - [0, 12676.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19356.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19416.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19476.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19417.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19312.0] + - - [3072, 10224, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19391.0] + - - [3072, 10240, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19373.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19264.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19321.0] + - - [3072, 10192, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19247.0] + - - [3072, 10200, 1, 1024, 3072, 3072, 1024, 1024] + - [32, 19334.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19294.0] + - - [3072, 10208, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19322.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18706.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19245.0] + - - [2048, 10224, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 19673.0] + - - [2048, 10240, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 19805.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19213.0] + - - [2048, 10192, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 19623.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19248.0] + - - [3072, 10080, 1, 1024, 3072, 3072, 1024, 1024] + - [13, 19345.0] + - - [256, 256, 25, 12544, 256, 256, 12544, 12544] + - [13, 15324.0] + - - [256, 256, 49, 3200, 256, 256, 3200, 3200] + - [32, 17155.0] + - - [256, 256, 25, 6272, 256, 256, 6272, 6272] + - [30, 16438.0] + - - [256, 256, 49, 6400, 256, 256, 6400, 6400] + - [32, 15575.0] + - - [512, 512, 49, 1152, 512, 512, 1152, 1152] + - [24, 19487.0] + - - [512, 512, 25, 2048, 512, 512, 2048, 2048] + - [13, 15352.0] + - - [512, 512, 49, 2304, 512, 512, 2304, 2304] + - [51, 19843.0] + - - [512, 512, 25, 4096, 512, 512, 4096, 4096] + - [53, 14616.0] + - - [128, 128, 2048, 64, 128, 128, 64, 64] + - [0, 6359.0] + - - [30528, 2560, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 19810.0] + - - [128, 128, 1536, 64, 128, 128, 64, 64] + - [0, 6196.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19337.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19251.0] + - - [30528, 1920, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 19728.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19329.0] + - - [128, 128, 81, 12544, 128, 128, 12544, 12544] + - [33, 10556.0] + - - [128, 128, 121, 9216, 128, 128, 9216, 9216] + - [52, 9078.0] + - - [128, 128, 169, 6400, 128, 128, 6400, 6400] + - [52, 11285.0] + - - [256, 256, 36, 4096, 256, 256, 4096, 4096] + - [34, 12639.0] + - - [256, 256, 49, 2304, 256, 256, 2304, 2304] + - [15, 16543.0] + - - [256, 256, 64, 2304, 256, 256, 2304, 2304] + - [51, 15969.0] + - - [256, 256, 81, 4096, 256, 256, 4096, 4096] + - [53, 11935.0] + - - [256, 256, 121, 2304, 256, 256, 2304, 2304] + - [32, 16466.0] + - - [256, 256, 169, 2304, 256, 256, 2304, 2304] + - [32, 16663.0] + - - [512, 512, 81, 1024, 512, 512, 1024, 1024] + - [49, 17047.0] + - - [512, 512, 121, 1024, 512, 512, 1024, 1024] + - [13, 17177.0] + - - [512, 512, 169, 1024, 512, 512, 1024, 1024] + - [49, 17467.0] + - - [512, 512, 36, 1024, 512, 512, 1024, 1024] + - [13, 16444.0] + - - [512, 512, 49, 1024, 512, 512, 1024, 1024] + - [30, 15965.0] + - - [512, 512, 64, 1024, 512, 512, 1024, 1024] + - [30, 16374.0] + - - [128, 128, 192, 64, 128, 128, 64, 64] + - [42, 10356.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [49, 15475.0] + - - [3072, 2048, 1, 768, 3072, 3072, 768, 768] + - [30, 18584.0] + - - [768, 2048, 1, 3072, 768, 768, 3072, 3072] + - [49, 16011.0] + - - [384, 384, 144, 64, 384, 384, 64, 64] + - [18, 17741.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [32, 19001.0] + - - [3072, 4608, 1, 768, 3072, 3072, 768, 768] + - [15, 19833.0] + - - [768, 4608, 1, 3072, 768, 768, 3072, 3072] + - [32, 19748.0] + - - [512, 512, 48, 64, 512, 512, 64, 64] + - [38, 16975.0] + - - [128, 128, 256, 64, 128, 128, 64, 64] + - [21, 12509.0] + - - [384, 384, 192, 64, 384, 384, 64, 64] + - [0, 14109.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19062.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 19972.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19941.0] + - - [256, 256, 36, 432, 256, 256, 432, 432] + - [1, 17994.0] + - - [256, 256, 36, 456, 256, 256, 456, 456] + - [1, 18259.0] + - - [256, 256, 36, 504, 256, 256, 504, 504] + - [1, 18361.0] + - - [256, 256, 49, 1120, 256, 256, 1120, 1120] + - [37, 17558.0] + - - [256, 256, 36, 442, 256, 256, 442, 442] + - [1, 17415.0] + - - [256, 256, 49, 950, 256, 256, 950, 950] + - [18, 17449.0] + - - [256, 256, 64, 616, 256, 256, 616, 616] + - [1, 17352.0] + - - [256, 256, 64, 660, 256, 256, 660, 660] + - [1, 17254.0] + - - [256, 256, 36, 408, 256, 256, 408, 408] + - [1, 17979.0] + - - [256, 256, 49, 1008, 256, 256, 1008, 1008] + - [37, 17680.0] + - - [256, 256, 36, 462, 256, 256, 462, 462] + - [1, 17834.0] + - - [256, 256, 36, 468, 256, 256, 468, 468] + - [1, 18018.0] + - - [256, 256, 36, 494, 256, 256, 494, 494] + - [1, 18081.0] + - - [512, 512, 64, 48, 512, 512, 48, 48] + - [38, 16308.0] + - - [256, 256, 64, 140, 256, 256, 140, 140] + - [38, 15205.0] + - - [512, 512, 64, 56, 512, 512, 56, 56] + - [27, 17076.0] + - - [512, 512, 49, 90, 512, 512, 90, 90] + - [19, 17041.0] + - - [512, 512, 49, 60, 512, 512, 60, 60] + - [19, 16003.0] + - - [256, 256, 49, 864, 256, 256, 864, 864] + - [43, 17415.0] + - - [256, 256, 64, 224, 256, 256, 224, 224] + - [17, 16460.0] + - - [256, 256, 64, 176, 256, 256, 176, 176] + - [1, 16267.0] + - - [256, 256, 64, 154, 256, 256, 154, 154] + - [1, 15632.0] + - - [512, 512, 49, 80, 512, 512, 80, 80] + - [1, 17748.0] + - - [256, 256, 49, 1200, 256, 256, 1200, 1200] + - [18, 17575.0] + - - [256, 256, 64, 704, 256, 256, 704, 704] + - [37, 17234.0] + - - [256, 256, 64, 768, 256, 256, 768, 768] + - [51, 17118.0] + - - [256, 256, 49, 1160, 256, 256, 1160, 1160] + - [20, 17350.0] + - - [256, 256, 49, 320, 256, 256, 320, 320] + - [18, 16270.0] + - - [512, 512, 49, 70, 512, 512, 70, 70] + - [19, 16301.0] + - - [256, 256, 49, 1240, 256, 256, 1240, 1240] + - [24, 17419.0] + - - [256, 256, 36, 384, 256, 256, 384, 384] + - [32, 16544.0] + - - [1024, 2048, 1, 888, 1024, 1024, 888, 888] + - [24, 17041.0] + - - [1024, 2048, 1, 713, 1024, 1024, 713, 713] + - [24, 16835.0] + - - [1024, 2048, 1, 660, 1024, 1024, 660, 660] + - [20, 16757.0] + - - [1024, 2048, 1, 726, 1024, 1024, 726, 726] + - [24, 16783.0] + - - [1024, 2048, 1, 672, 1024, 1024, 672, 672] + - [24, 16898.0] + - - [1024, 2048, 1, 850, 1024, 1024, 850, 850] + - [24, 16945.0] + - - [1024, 2048, 1, 805, 1024, 1024, 805, 805] + - [24, 16848.0] + - - [1024, 2048, 1, 864, 1024, 1024, 864, 864] + - [24, 17100.0] + - - [1024, 2048, 1, 768, 1024, 1024, 768, 768] + - [32, 16795.0] + - - [1024, 2048, 1, 950, 1024, 1024, 950, 950] + - [24, 17057.0] + - - [1024, 1024, 160, 96, 1024, 1024, 96, 96] + - [0, 14626.0] + - - [2880, 16384, 1, 1920, 2880, 2880, 1920, 1920] + - [7, 19741.0] + - - [1920, 16384, 1, 960, 1920, 1920, 960, 960] + - [37, 20263.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 1920, 1920] + - [24, 20284.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 3840, 3840] + - [24, 20238.0] + - - [25216, 16384, 1, 1920, 25216, 25216, 1920, 1920] + - [24, 20407.0] + - - [1024, 1024, 40, 96, 1024, 1024, 96, 96] + - [0, 17769.0] + - - [2880, 4096, 1, 1920, 2880, 2880, 1920, 1920] + - [22, 18736.0] + - - [1920, 4096, 1, 960, 1920, 1920, 960, 960] + - [37, 19190.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 1920, 1920] + - [5, 19518.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 3840, 3840] + - [51, 19275.0] + - - [25216, 4096, 1, 1920, 25216, 25216, 1920, 1920] + - [24, 20333.0] + - - [1024, 1024, 80, 96, 1024, 1024, 96, 96] + - [0, 15891.0] + - - [2880, 8192, 1, 1920, 2880, 2880, 1920, 1920] + - [24, 19465.0] + - - [1920, 8192, 1, 960, 1920, 1920, 960, 960] + - [37, 20110.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 1920, 1920] + - [24, 20167.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 3840, 3840] + - [30, 19577.0] + - - [25216, 8192, 1, 1920, 25216, 25216, 1920, 1920] + - [43, 20374.0] + - - [1024, 1024, 96, 96, 1024, 1024, 96, 96] + - [0, 15273.0] + - - [1728, 16384, 1, 2304, 1728, 1728, 2304, 2304] + - [43, 19439.0] + - - [2304, 16384, 1, 576, 2304, 2304, 576, 576] + - [1, 20330.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [51, 20403.0] + - - [12672, 16384, 1, 2304, 12672, 12672, 2304, 2304] + - [7, 20410.0] + - - [1024, 1024, 24, 96, 1024, 1024, 96, 96] + - [9, 18510.0] + - - [1728, 4096, 1, 2304, 1728, 1728, 2304, 2304] + - [13, 17974.0] + - - [2304, 4096, 1, 576, 2304, 2304, 576, 576] + - [37, 20024.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [32, 20210.0] + - - [12672, 4096, 1, 2304, 12672, 12672, 2304, 2304] + - [7, 20094.0] + - - [1024, 1024, 48, 96, 1024, 1024, 96, 96] + - [0, 17452.0] + - - [1728, 8192, 1, 2304, 1728, 1728, 2304, 2304] + - [13, 18897.0] + - - [2304, 8192, 1, 576, 2304, 2304, 576, 576] + - [37, 20245.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [24, 20289.0] + - - [12672, 8192, 1, 2304, 12672, 12672, 2304, 2304] + - [24, 20332.0] + - - [1024, 1024, 16, 96, 1024, 1024, 96, 96] + - [1, 18117.0] + - - [1152, 4096, 1, 3072, 1152, 1152, 3072, 3072] + - [32, 19930.0] + - - [3072, 4096, 1, 384, 3072, 3072, 384, 384] + - [24, 18925.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 3072, 3072] + - [49, 18745.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 1536, 1536] + - [15, 19553.0] + - - [6400, 4096, 1, 3072, 6400, 6400, 3072, 3072] + - [15, 19734.0] + - - [1024, 1024, 32, 96, 1024, 1024, 96, 96] + - [0, 17822.0] + - - [1152, 8192, 1, 3072, 1152, 1152, 3072, 3072] + - [51, 20211.0] + - - [3072, 8192, 1, 384, 3072, 3072, 384, 384] + - [5, 19369.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 3072, 3072] + - [51, 19657.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 1536, 1536] + - [30, 19552.0] + - - [6400, 8192, 1, 3072, 6400, 6400, 3072, 3072] + - [32, 20133.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 18467.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 4096, 4096] + - [13, 18331.0] + - - [29000, 199, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 13146.0] + - - [29000, 221, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 13606.0] + - - [29000, 224, 1, 2048, 29000, 29000, 2048, 2048] + - [32, 14244.0] + - - [29000, 229, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 14120.0] + - - [29000, 234, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 14945.0] + - - [29000, 242, 1, 2048, 29000, 29000, 2048, 2048] + - [32, 15601.0] + - - [29000, 246, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 15469.0] + - - [29000, 247, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 15508.0] + - - [29000, 256, 1, 2048, 29000, 29000, 2048, 2048] + - [32, 15877.0] + - - [29000, 262, 1, 2048, 29000, 29000, 2048, 2048] + - [22, 11110.0] + - - [29000, 264, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 11206.0] + - - [29000, 265, 1, 2048, 29000, 29000, 2048, 2048] + - [48, 11161.0] + - - [29000, 274, 1, 2048, 29000, 29000, 2048, 2048] + - [48, 11395.0] + - - [29000, 277, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 11691.0] + - - [29000, 279, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 11697.0] + - - [29000, 288, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 12225.0] + - - [29000, 296, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 12142.0] + - - [29000, 315, 1, 2048, 29000, 29000, 2048, 2048] + - [40, 13144.0] + - - [29000, 335, 1, 2048, 29000, 29000, 2048, 2048] + - [30, 13907.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 2048, 2048] + - [13, 19427.0] + - - [29000, 2283, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19588.0] + - - [29000, 2296, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 19841.0] + - - [29000, 2306, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18796.0] + - - [29000, 2309, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18809.0] + - - [29000, 2318, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18890.0] + - - [29000, 2320, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18911.0] + - - [29000, 2324, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18937.0] + - - [29000, 2325, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18941.0] + - - [29000, 2329, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18972.0] + - - [29000, 2338, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19049.0] + - - [29000, 2345, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19107.0] + - - [29000, 2350, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19149.0] + - - [29000, 2362, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19242.0] + - - [29000, 2366, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19268.0] + - - [29000, 2368, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19295.0] + - - [29000, 2374, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19341.0] + - - [29000, 2390, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19471.0] + - - [512, 512, 320, 64, 512, 512, 64, 64] + - [0, 10600.0] + - - [29000, 561, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 16888.0] + - - [29000, 574, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 17260.0] + - - [29000, 600, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 17998.0] + - - [29000, 608, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18247.0] + - - [29000, 615, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18449.0] + - - [29000, 622, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18660.0] + - - [29000, 625, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18717.0] + - - [29000, 626, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18799.0] + - - [29000, 628, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 18838.0] + - - [29000, 636, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19036.0] + - - [29000, 651, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 16597.0] + - - [29000, 658, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 16688.0] + - - [29000, 669, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 17054.0] + - - [29000, 670, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 17043.0] + - - [29000, 672, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 17086.0] + - - [29000, 684, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 17421.0] + - - [29000, 716, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 18222.0] + - - [29000, 730, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 18538.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [49, 16600.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 4096, 4096] + - [49, 16362.0] + - - [1024, 1024, 512, 64, 1024, 1024, 64, 64] + - [12, 10702.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19878.0] + - - [3072, 32768, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 19745.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19595.0] + - - [50304, 32768, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 19895.0] + - - [1024, 1024, 24, 128, 1024, 1024, 128, 128] + - [5, 18564.0] + - - [128, 1024, 24, 1024, 128, 128, 1024, 1024] + - [51, 16997.0] + - - [4096, 256, 1, 12288, 4096, 4096, 12288, 12288] + - [34, 13832.0] + - - [2048, 256, 1, 13312, 2048, 2048, 13312, 13312] + - [56, 16121.0] + - - [4096, 256, 1, 15360, 4096, 4096, 15360, 15360] + - [53, 13795.0] + - - [2048, 512, 1, 16640, 2048, 2048, 16640, 16640] + - [67, 16976.0] + - - [4096, 256, 1, 14336, 4096, 4096, 14336, 14336] + - [53, 13864.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 8192, 8192] + - [51, 16419.0] + - - [1024, 512, 1, 16384, 1024, 1024, 16384, 16384] + - [56, 15075.0] + - - [4096, 256, 1, 9216, 4096, 4096, 9216, 9216] + - [67, 14265.0] + - - [1024, 512, 1, 12288, 1024, 1024, 12288, 12288] + - [56, 15704.0] + - - [4096, 200, 1, 12288, 4096, 4096, 12288, 12288] + - [59, 10888.0] + - - [1024, 1024, 1, 13312, 1024, 1024, 13312, 13312] + - [56, 16732.0] + - - [2048, 256, 1, 16384, 2048, 2048, 16384, 16384] + - [67, 13791.0] + - - [2048, 512, 1, 16384, 2048, 2048, 16384, 16384] + - [60, 14879.0] + - - [1024, 1024, 1, 8320, 1024, 1024, 8320, 8320] + - [24, 16450.0] + - - [2048, 256, 1, 14336, 2048, 2048, 14336, 14336] + - [60, 16012.0] + - - [4096, 200, 1, 16640, 4096, 4096, 16640, 16640] + - [58, 13084.0] + - - [1024, 1024, 1, 16640, 1024, 1024, 16640, 16640] + - [56, 16964.0] + - - [1024, 1024, 1, 14336, 1024, 1024, 14336, 14336] + - [56, 16784.0] + - - [2048, 512, 1, 9216, 2048, 2048, 9216, 9216] + - [32, 16439.0] + - - [1024, 1024, 1, 15360, 1024, 1024, 15360, 15360] + - [56, 16854.0] + - - [2048, 512, 1, 8192, 2048, 2048, 8192, 8192] + - [32, 16419.0] + - - [2048, 512, 1, 13312, 2048, 2048, 13312, 13312] + - [60, 16738.0] + - - [1024, 1024, 1, 11264, 1024, 1024, 11264, 11264] + - [56, 16539.0] + - - [1024, 512, 1, 16640, 1024, 1024, 16640, 16640] + - [55, 16275.0] + - - [2048, 512, 1, 10240, 2048, 2048, 10240, 10240] + - [51, 16564.0] + - - [2048, 256, 1, 16640, 2048, 2048, 16640, 16640] + - [56, 16246.0] + - - [4096, 256, 1, 13312, 4096, 4096, 13312, 13312] + - [66, 13811.0] + - - [4096, 200, 1, 15360, 4096, 4096, 15360, 15360] + - [59, 11097.0] + - - [2048, 512, 1, 12288, 2048, 2048, 12288, 12288] + - [56, 16640.0] + - - [4096, 256, 1, 8192, 4096, 4096, 8192, 8192] + - [51, 16375.0] + - - [2048, 512, 1, 15360, 2048, 2048, 15360, 15360] + - [67, 16283.0] + - - [2048, 512, 1, 11264, 2048, 2048, 11264, 11264] + - [56, 16565.0] + - - [2048, 256, 1, 12288, 2048, 2048, 12288, 12288] + - [60, 16018.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 12288, 12288] + - [56, 16654.0] + - - [4096, 256, 1, 16384, 4096, 4096, 16384, 16384] + - [53, 13507.0] + - - [2048, 256, 1, 15360, 2048, 2048, 15360, 15360] + - [67, 16085.0] + - - [2048, 512, 1, 8320, 2048, 2048, 8320, 8320] + - [24, 16442.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 10240, 10240] + - [51, 16563.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 9216, 9216] + - [32, 16583.0] + - - [4096, 200, 1, 16384, 4096, 4096, 16384, 16384] + - [34, 10779.0] + - - [2048, 512, 1, 14336, 2048, 2048, 14336, 14336] + - [60, 16730.0] + - - [1024, 512, 1, 13312, 1024, 1024, 13312, 13312] + - [67, 15986.0] + - - [4096, 256, 1, 8320, 4096, 4096, 8320, 8320] + - [15, 16404.0] + - - [4096, 200, 1, 13312, 4096, 4096, 13312, 13312] + - [59, 11340.0] + - - [1024, 512, 1, 14336, 1024, 1024, 14336, 14336] + - [67, 16060.0] + - - [4096, 256, 1, 11264, 4096, 4096, 11264, 11264] + - [53, 13760.0] + - - [4096, 256, 1, 10240, 4096, 4096, 10240, 10240] + - [34, 13906.0] + - - [4096, 200, 1, 14336, 4096, 4096, 14336, 14336] + - [66, 10846.0] + - - [4096, 256, 1, 16640, 4096, 4096, 16640, 16640] + - [58, 16412.0] + - - [1024, 512, 1, 15360, 1024, 1024, 15360, 15360] + - [56, 16336.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 16384, 16384] + - [15, 16478.0] + - - [224, 192, 36, 10368, 224, 224, 10368, 10368] + - [4, 13422.0] + - - [320, 256, 9, 19584, 320, 320, 19584, 19584] + - [56, 15948.0] + - - [256, 256, 11, 13056, 256, 256, 13056, 13056] + - [65, 14487.0] + - - [320, 256, 9, 9792, 320, 320, 9792, 9792] + - [63, 15394.0] + - - [320, 256, 11, 13056, 320, 320, 13056, 13056] + - [60, 14304.0] + - - [256, 256, 9, 9792, 256, 256, 9792, 9792] + - [54, 17274.0] + - - [256, 224, 9, 19584, 256, 256, 19584, 19584] + - [56, 14830.0] + - - [256, 256, 9, 19584, 256, 256, 19584, 19584] + - [54, 16699.0] + - - [128, 128, 36, 12000, 128, 128, 12000, 12000] + - [47, 12604.0] + - - [128, 128, 49, 12800, 128, 128, 12800, 12800] + - [57, 10993.0] + - - [128, 128, 25, 25088, 128, 128, 25088, 25088] + - [64, 10361.0] + - - [128, 128, 49, 25600, 128, 128, 25600, 25600] + - [62, 9670.0] + - - [128, 128, 25, 50176, 128, 128, 50176, 50176] + - [61, 9649.0] + - - [128, 128, 36, 12544, 128, 128, 12544, 12544] + - [35, 12493.0] + - - [128, 128, 49, 9216, 128, 128, 9216, 9216] + - [68, 8975.0] + - - [1024, 1024, 1, 12544, 1024, 1024, 12544, 12544] + - [56, 16665.0] + - - [1024, 1000, 1, 12544, 1024, 1024, 12544, 12544] + - [56, 16333.0] + - - [1024, 512, 1, 1600, 1024, 1024, 1600, 1600] + - [121, 13747.0] + - - [2048, 512, 1, 100, 2048, 2048, 100, 100] + - [73, 10832.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [110, 11391.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [110, 13329.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 12573.0] + - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] + - [110, 13608.0] + - - [30522, 120, 1, 1024, 30522, 30522, 1024, 1024] + - [87, 15073.0] + - - [30522, 80, 1, 1024, 30522, 30522, 1024, 1024] + - [105, 10390.0] + - - [64, 128, 512, 128, 64, 64, 128, 128] + - [86, 10652.0] + - - [64, 512, 64, 512, 64, 64, 512, 512] + - [83, 10957.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [76, 8916.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [72, 5447.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [108, 14179.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [105, 12386.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [82, 11305.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 128] + - [97, 10863.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [136, 15377.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [108, 13620.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [136, 11029.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [110, 9477.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [85, 13293.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [103, 9043.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [129, 6953.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [133, 6790.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [85, 11127.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [87, 11497.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [110, 11275.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [105, 13550.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [102, 7286.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [106, 9090.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [108, 12914.0] + - - [704, 1024, 1, 128, 704, 704, 128, 128] + - [129, 9672.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [85, 10629.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [136, 13924.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [110, 8910.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [87, 9417.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [85, 10975.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [87, 13938.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 128] + - [130, 10028.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [110, 7915.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [102, 8552.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [103, 9477.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [106, 7931.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 128] + - [79, 7829.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [82, 13276.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [102, 8038.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [84, 7659.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [130, 10570.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [81, 9196.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 128] + - [80, 7162.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 128] + - [103, 10561.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [133, 7818.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [87, 11475.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [83, 9202.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [85, 12040.0] + - - [448, 1856, 1, 128, 448, 448, 128, 128] + - [83, 9984.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [85, 12122.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [85, 10975.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [136, 12760.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [105, 13725.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [105, 13483.0] + - - [704, 1856, 1, 128, 704, 704, 128, 128] + - [103, 11878.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 128] + - [125, 11391.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [110, 9269.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [105, 13138.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [85, 8449.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [85, 10866.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [136, 11502.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [134, 12937.0] + - - [448, 2368, 1, 128, 448, 448, 128, 128] + - [83, 10829.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [131, 12235.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [87, 12808.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [133, 7869.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [110, 12250.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [82, 8973.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [87, 14031.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [136, 7606.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [85, 10917.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [136, 11822.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [85, 13242.0] + - - [448, 1024, 1, 128, 448, 448, 128, 128] + - [129, 7395.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [132, 12987.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 128] + - [80, 6472.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [108, 11034.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [85, 10345.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [82, 12539.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [101, 8983.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [87, 15002.0] + - - [256, 1856, 1, 128, 256, 256, 128, 128] + - [130, 8196.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [134, 11683.0] + - - [448, 1408, 1, 128, 448, 448, 128, 128] + - [102, 8499.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [131, 11330.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [102, 6673.0] + - - [704, 1408, 1, 128, 704, 704, 128, 128] + - [82, 10844.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [87, 13196.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [80, 7560.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [85, 12401.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [85, 10138.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [87, 8210.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [132, 12806.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [132, 9866.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [134, 12561.0] + - - [256, 2368, 1, 128, 256, 256, 128, 128] + - [81, 8546.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [131, 11704.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [108, 10875.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [134, 13586.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [103, 10440.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [134, 11921.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [103, 10413.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [85, 12478.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 128] + - [103, 10194.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [102, 7539.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [83, 7794.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [107, 7656.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [85, 10059.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [134, 13517.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [85, 8510.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [129, 7857.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [87, 11950.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [131, 15356.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [136, 14077.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [87, 13770.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 128] + - [129, 6927.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [110, 8952.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [102, 7028.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [108, 10472.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [110, 9878.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [85, 13811.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [82, 13589.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [85, 10876.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [133, 6790.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 128] + - [81, 9796.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [85, 11158.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [104, 9771.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 128] + - [101, 8446.0] + - - [704, 448, 1, 128, 704, 704, 128, 128] + - [102, 6249.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [131, 11768.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [102, 8120.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [81, 10235.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [132, 11239.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [136, 11937.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [85, 12910.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [131, 14129.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [85, 8019.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [82, 12059.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [132, 11798.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [106, 9588.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [87, 7913.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [83, 7929.0] + - - [64, 5888, 1, 128, 64, 64, 128, 128] + - [80, 7467.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [85, 11089.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [129, 6913.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [87, 9879.0] + - - [704, 704, 1, 128, 704, 704, 128, 128] + - [117, 7990.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [80, 6043.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [129, 8738.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [110, 11844.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [129, 8950.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [85, 10469.0] + - - [256, 3584, 1, 128, 256, 256, 128, 128] + - [105, 10735.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 128] + - [82, 11768.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 128] + - [101, 8416.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [85, 8944.0] + - - [256, 2944, 1, 128, 256, 256, 128, 128] + - [103, 10198.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [87, 9459.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [106, 11586.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [87, 15429.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 128] + - [102, 10707.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [129, 8804.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [85, 9217.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [85, 10090.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [80, 7471.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [103, 9651.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [108, 13575.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [85, 10576.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [132, 12649.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 128] + - [101, 7837.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [136, 8497.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [103, 8857.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [110, 10988.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [85, 9511.0] + - - [448, 2944, 1, 128, 448, 448, 128, 128] + - [73, 11121.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [103, 7850.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [102, 8010.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [134, 12901.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [108, 13917.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [85, 12663.0] + - - [64, 5056, 1, 128, 64, 64, 128, 128] + - [80, 6512.0] + - - [64, 6784, 1, 128, 64, 64, 128, 128] + - [129, 7312.0] + - - [448, 704, 1, 128, 448, 448, 128, 128] + - [129, 6288.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 128] + - [131, 10863.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [108, 10408.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [108, 12590.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [87, 12729.0] + - - [256, 1408, 1, 128, 256, 256, 128, 128] + - [129, 6845.0] + - - [256, 4288, 1, 128, 256, 256, 128, 128] + - [81, 12155.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [103, 9570.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [87, 12294.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [133, 8896.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [87, 11907.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [129, 7794.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [131, 13355.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [133, 8873.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [87, 11236.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [85, 10744.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [106, 12848.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [103, 9247.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [129, 9359.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [80, 6158.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [85, 12640.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [108, 12354.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [105, 11377.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [136, 12305.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [131, 10068.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [105, 12500.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [110, 13673.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 128] + - [129, 6871.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [102, 9413.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [105, 12416.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [106, 11251.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [136, 13201.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [126, 8200.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [85, 13673.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [87, 14767.0] + - - [2048, 200, 1, 3200, 2048, 2048, 3200, 3200] + - [78, 10492.0] + - - [2048, 256, 1, 3328, 2048, 2048, 3328, 3328] + - [110, 13602.0] + - - [4096, 200, 1, 11264, 4096, 4096, 11264, 11264] + - [87, 10603.0] + - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] + - [105, 14263.0] + - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] + - [71, 9898.0] + - - [512, 1024, 1, 1536, 512, 512, 1536, 1536] + - [110, 13077.0] + - - [1024, 512, 1, 512, 1024, 1024, 512, 512] + - [85, 11092.0] + - - [2048, 512, 1, 640, 2048, 2048, 640, 640] + - [105, 15033.0] + - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] + - [82, 13988.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [110, 13266.0] + - - [1024, 512, 1, 128, 1024, 1024, 128, 128] + - [81, 8807.0] + - - [2048, 512, 1, 256, 2048, 2048, 256, 256] + - [82, 13250.0] + - - [4096, 200, 1, 2560, 4096, 4096, 2560, 2560] + - [105, 12051.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1152, 1152] + - [105, 15225.0] + - - [2048, 200, 1, 32, 2048, 2048, 32, 32] + - [128, 3855.0] + - - [512, 1024, 1, 2816, 512, 512, 2816, 2816] + - [110, 13542.0] + - - [2048, 200, 1, 2080, 2048, 2048, 2080, 2080] + - [74, 10837.0] + - - [2048, 200, 1, 1024, 2048, 2048, 1024, 1024] + - [87, 9317.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [105, 11526.0] + - - [1024, 512, 1, 11264, 1024, 1024, 11264, 11264] + - [110, 14010.0] + - - [1024, 1024, 1, 1792, 1024, 1024, 1792, 1792] + - [105, 14623.0] + - - [4096, 200, 1, 768, 4096, 4096, 768, 768] + - [131, 11303.0] + - - [4096, 256, 1, 1024, 4096, 4096, 1024, 1024] + - [82, 14637.0] + - - [1024, 512, 1, 256, 1024, 1024, 256, 256] + - [103, 10076.0] + - - [1024, 512, 1, 1408, 1024, 1024, 1408, 1408] + - [126, 12978.0] + - - [1024, 512, 1, 5632, 1024, 1024, 5632, 5632] + - [136, 13846.0] + - - [4096, 200, 1, 256, 4096, 4096, 256, 256] + - [108, 9691.0] + - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] + - [110, 13589.0] + - - [1024, 1024, 1, 4160, 1024, 1024, 4160, 4160] + - [96, 16077.0] + - - [2048, 256, 1, 384, 2048, 2048, 384, 384] + - [130, 11651.0] + - - [4096, 200, 1, 640, 4096, 4096, 640, 640] + - [82, 11108.0] + - - [1024, 1024, 1, 7168, 1024, 1024, 7168, 7168] + - [87, 14855.0] + - - [4096, 256, 1, 768, 4096, 4096, 768, 768] + - [105, 14897.0] + - - [2048, 256, 1, 6656, 2048, 2048, 6656, 6656] + - [110, 13870.0] + - - [2048, 200, 1, 3072, 2048, 2048, 3072, 3072] + - [87, 10439.0] + - - [1024, 512, 1, 2816, 1024, 1024, 2816, 2816] + - [110, 13547.0] + - - [4096, 256, 1, 7680, 4096, 4096, 7680, 7680] + - [136, 14740.0] + - - [4096, 200, 1, 1024, 4096, 4096, 1024, 1024] + - [105, 11447.0] + - - [2048, 200, 1, 1792, 2048, 2048, 1792, 1792] + - [87, 10066.0] + - - [1024, 1024, 1, 2816, 1024, 1024, 2816, 2816] + - [110, 14730.0] + - - [2048, 512, 1, 1536, 2048, 2048, 1536, 1536] + - [136, 14563.0] + - - [4096, 256, 1, 3072, 4096, 4096, 3072, 3072] + - [110, 14730.0] + - - [2048, 256, 1, 5632, 2048, 2048, 5632, 5632] + - [136, 13824.0] + - - [1024, 512, 1, 6656, 1024, 1024, 6656, 6656] + - [136, 13893.0] + - - [4096, 200, 1, 2080, 4096, 4096, 2080, 2080] + - [74, 12290.0] + - - [2048, 200, 1, 13312, 2048, 2048, 13312, 13312] + - [87, 10933.0] + - - [4096, 256, 1, 3584, 4096, 4096, 3584, 3584] + - [131, 14852.0] + - - [2048, 256, 1, 8192, 2048, 2048, 8192, 8192] + - [136, 13967.0] + - - [2048, 512, 1, 512, 2048, 2048, 512, 512] + - [82, 13901.0] + - - [2048, 512, 1, 1152, 2048, 2048, 1152, 1152] + - [131, 15237.0] + - - [2048, 200, 1, 9216, 2048, 2048, 9216, 9216] + - [87, 10872.0] + - - [2048, 200, 1, 2560, 2048, 2048, 2560, 2560] + - [110, 10282.0] + - - [2048, 256, 1, 4608, 2048, 2048, 4608, 4608] + - [110, 13760.0] + - - [2048, 256, 1, 3584, 2048, 2048, 3584, 3584] + - [87, 13658.0] + - - [1024, 512, 1, 640, 1024, 1024, 640, 640] + - [81, 12114.0] + - - [2048, 512, 1, 768, 2048, 2048, 768, 768] + - [131, 14679.0] + - - [2048, 200, 1, 1408, 2048, 2048, 1408, 1408] + - [131, 9923.0] + - - [4096, 200, 1, 2048, 4096, 4096, 2048, 2048] + - [131, 11827.0] + - - [1024, 1024, 1, 5632, 1024, 1024, 5632, 5632] + - [110, 14843.0] + - - [2048, 512, 1, 3584, 2048, 2048, 3584, 3584] + - [87, 14768.0] + - - [1024, 512, 1, 64, 1024, 1024, 64, 64] + - [120, 6631.0] + - - [4096, 200, 1, 7680, 4096, 4096, 7680, 7680] + - [110, 11555.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [110, 14441.0] + - - [2048, 200, 1, 896, 2048, 2048, 896, 896] + - [130, 9951.0] + - - [2048, 256, 1, 32, 2048, 2048, 32, 32] + - [114, 4766.0] + - - [2048, 256, 1, 1280, 2048, 2048, 1280, 1280] + - [136, 12763.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [87, 14747.0] + - - [2048, 256, 1, 11264, 2048, 2048, 11264, 11264] + - [136, 14005.0] + - - [4096, 200, 1, 9216, 4096, 4096, 9216, 9216] + - [87, 11377.0] + - - [1024, 512, 1, 4096, 1024, 1024, 4096, 4096] + - [110, 13465.0] + - - [4096, 200, 1, 3840, 4096, 4096, 3840, 3840] + - [131, 11595.0] + - - [1024, 1024, 1, 1920, 1024, 1024, 1920, 1920] + - [74, 15373.0] + - - [2048, 200, 1, 7168, 2048, 2048, 7168, 7168] + - [87, 10800.0] + - - [4096, 256, 1, 1152, 4096, 4096, 1152, 1152] + - [131, 15294.0] + - - [2048, 256, 1, 1920, 2048, 2048, 1920, 1920] + - [110, 13217.0] + - - [2048, 512, 1, 4160, 2048, 2048, 4160, 4160] + - [96, 16105.0] + - - [2048, 512, 1, 5632, 2048, 2048, 5632, 5632] + - [87, 14831.0] + - - [4096, 256, 1, 7168, 4096, 4096, 7168, 7168] + - [136, 14525.0] + - - [4096, 200, 1, 128, 4096, 4096, 128, 128] + - [130, 8886.0] + - - [2048, 200, 1, 5120, 2048, 2048, 5120, 5120] + - [110, 10666.0] + - - [1024, 1024, 1, 6656, 1024, 1024, 6656, 6656] + - [87, 14835.0] + - - [512, 1024, 1, 3200, 512, 512, 3200, 3200] + - [100, 13636.0] + - - [2048, 256, 1, 1536, 2048, 2048, 1536, 1536] + - [110, 13006.0] + - - [4096, 256, 1, 256, 4096, 4096, 256, 256] + - [131, 13276.0] + - - [2048, 512, 1, 1408, 2048, 2048, 1408, 1408] + - [105, 15350.0] + - - [1024, 512, 1, 2080, 1024, 1024, 2080, 2080] + - [96, 14379.0] + - - [2048, 512, 1, 2304, 2048, 2048, 2304, 2304] + - [105, 14858.0] + - - [4096, 200, 1, 512, 4096, 4096, 512, 512] + - [82, 10744.0] + - - [2048, 200, 1, 1280, 2048, 2048, 1280, 1280] + - [87, 9734.0] + - - [1024, 1024, 1, 2304, 1024, 1024, 2304, 2304] + - [105, 14825.0] + - - [2048, 512, 1, 4608, 2048, 2048, 4608, 4608] + - [110, 14805.0] + - - [4096, 256, 1, 6144, 4096, 4096, 6144, 6144] + - [87, 14769.0] + - - [4096, 256, 1, 896, 4096, 4096, 896, 896] + - [105, 15085.0] + - - [2048, 256, 1, 640, 2048, 2048, 640, 640] + - [130, 12354.0] + - - [2048, 512, 1, 384, 2048, 2048, 384, 384] + - [105, 14442.0] + - - [2048, 200, 1, 16384, 2048, 2048, 16384, 16384] + - [90, 9292.0] + - - [4096, 200, 1, 10240, 4096, 4096, 10240, 10240] + - [110, 10948.0] + - - [1024, 512, 1, 9216, 1024, 1024, 9216, 9216] + - [136, 13852.0] + - - [4096, 200, 1, 1920, 4096, 4096, 1920, 1920] + - [131, 11990.0] + - - [2048, 512, 1, 7680, 2048, 2048, 7680, 7680] + - [136, 14857.0] + - - [1024, 512, 1, 3584, 1024, 1024, 3584, 3584] + - [110, 13680.0] + - - [1024, 1024, 1, 32, 1024, 1024, 32, 32] + - [73, 6684.0] + - - [2048, 512, 1, 1664, 2048, 2048, 1664, 1664] + - [105, 15521.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [87, 10131.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 3584, 3584] + - [110, 14779.0] + - - [4096, 256, 1, 6656, 4096, 4096, 6656, 6656] + - [110, 14836.0] + - - [4096, 256, 1, 4160, 4096, 4096, 4160, 4160] + - [96, 16081.0] + - - [2048, 256, 1, 3072, 2048, 2048, 3072, 3072] + - [110, 13573.0] + - - [2048, 256, 1, 8320, 2048, 2048, 8320, 8320] + - [100, 13961.0] + - - [1024, 512, 1, 3200, 1024, 1024, 3200, 3200] + - [126, 13636.0] + - - [1024, 512, 1, 896, 1024, 1024, 896, 896] + - [105, 12601.0] + - - [2048, 512, 1, 1280, 2048, 2048, 1280, 1280] + - [105, 14943.0] + - - [4096, 200, 1, 64, 4096, 4096, 64, 64] + - [95, 7162.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 5120, 5120] + - [87, 14819.0] + - - [2048, 512, 1, 6656, 2048, 2048, 6656, 6656] + - [87, 14846.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 128] + - [105, 11899.0] + - - [512, 1024, 1, 1792, 512, 512, 1792, 1792] + - [126, 13225.0] + - - [4096, 256, 1, 2816, 4096, 4096, 2816, 2816] + - [105, 15411.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 4096, 4096] + - [110, 14765.0] + - - [2048, 200, 1, 4160, 2048, 2048, 4160, 4160] + - [74, 11298.0] + - - [1024, 512, 1, 768, 1024, 1024, 768, 768] + - [136, 12135.0] + - - [4096, 200, 1, 8320, 4096, 4096, 8320, 8320] + - [131, 11811.0] + - - [2048, 512, 1, 896, 2048, 2048, 896, 896] + - [105, 14946.0] + - - [4096, 200, 1, 7168, 4096, 4096, 7168, 7168] + - [82, 11568.0] + - - [2048, 200, 1, 3840, 2048, 2048, 3840, 3840] + - [126, 10440.0] + - - [1024, 1024, 1, 768, 1024, 1024, 768, 768] + - [105, 14589.0] + - - [4096, 256, 1, 2304, 4096, 4096, 2304, 2304] + - [131, 15266.0] + - - [2048, 200, 1, 16640, 2048, 2048, 16640, 16640] + - [85, 9996.0] + - - [2048, 256, 1, 2816, 2048, 2048, 2816, 2816] + - [110, 13119.0] + - - [1024, 512, 1, 384, 1024, 1024, 384, 384] + - [81, 11843.0] + - - [2048, 200, 1, 7680, 2048, 2048, 7680, 7680] + - [87, 10769.0] + - - [1024, 512, 1, 4608, 1024, 1024, 4608, 4608] + - [136, 13791.0] + - - [4096, 200, 1, 32, 4096, 4096, 32, 32] + - [114, 5350.0] + - - [4096, 200, 1, 3328, 4096, 4096, 3328, 3328] + - [131, 11836.0] + - - [1024, 1024, 1, 1408, 1024, 1024, 1408, 1408] + - [105, 15268.0] + - - [2048, 200, 1, 15360, 2048, 2048, 15360, 15360] + - [134, 9720.0] + - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] + - [110, 12816.0] + - - [4096, 256, 1, 5632, 4096, 4096, 5632, 5632] + - [110, 14783.0] + - - [2048, 256, 1, 1408, 2048, 2048, 1408, 1408] + - [105, 12879.0] + - - [2048, 256, 1, 6144, 2048, 2048, 6144, 6144] + - [110, 13874.0] + - - [4096, 256, 1, 3328, 4096, 4096, 3328, 3328] + - [131, 15389.0] + - - [2048, 512, 1, 6144, 2048, 2048, 6144, 6144] + - [110, 14850.0] + - - [2048, 512, 1, 3200, 2048, 2048, 3200, 3200] + - [105, 15661.0] + - - [2048, 200, 1, 4608, 2048, 2048, 4608, 4608] + - [87, 10561.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 6144, 6144] + - [110, 14840.0] + - - [4096, 256, 1, 1664, 4096, 4096, 1664, 1664] + - [131, 15463.0] + - - [2048, 200, 1, 384, 2048, 2048, 384, 384] + - [103, 8886.0] + - - [4096, 256, 1, 1792, 4096, 4096, 1792, 1792] + - [87, 14591.0] + - - [2048, 512, 1, 2816, 2048, 2048, 2816, 2816] + - [105, 14816.0] + - - [4096, 256, 1, 384, 4096, 4096, 384, 384] + - [131, 14218.0] + - - [2048, 256, 1, 128, 2048, 2048, 128, 128] + - [81, 8670.0] + - - [1024, 1024, 1, 640, 1024, 1024, 640, 640] + - [105, 14821.0] + - - [4096, 200, 1, 5632, 4096, 4096, 5632, 5632] + - [82, 11991.0] + - - [2048, 200, 1, 1152, 2048, 2048, 1152, 1152] + - [103, 9786.0] + - - [4096, 256, 1, 512, 4096, 4096, 512, 512] + - [131, 14309.0] + - - [1024, 1024, 1, 384, 1024, 1024, 384, 384] + - [105, 14370.0] + - - [2048, 200, 1, 512, 2048, 2048, 512, 512] + - [130, 8702.0] + - - [2048, 256, 1, 9216, 2048, 2048, 9216, 9216] + - [136, 13977.0] + - - [2048, 256, 1, 1792, 2048, 2048, 1792, 1792] + - [110, 13144.0] + - - [4096, 200, 1, 1792, 4096, 4096, 1792, 1792] + - [105, 11974.0] + - - [2048, 200, 1, 1536, 2048, 2048, 1536, 1536] + - [87, 9852.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] + - [87, 14738.0] + - - [1024, 1024, 1, 2080, 1024, 1024, 2080, 2080] + - [96, 15976.0] + - - [2048, 200, 1, 2304, 2048, 2048, 2304, 2304] + - [87, 10247.0] + - - [2048, 256, 1, 7168, 2048, 2048, 7168, 7168] + - [136, 13915.0] + - - [2048, 512, 1, 1792, 2048, 2048, 1792, 1792] + - [136, 14607.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 4608, 4608] + - [110, 14822.0] + - - [512, 1024, 1, 1280, 512, 512, 1280, 1280] + - [136, 12876.0] + - - [2048, 256, 1, 3200, 2048, 2048, 3200, 3200] + - [110, 13591.0] + - - [1024, 512, 1, 3328, 1024, 1024, 3328, 3328] + - [110, 13646.0] + - - [1024, 512, 1, 4160, 1024, 1024, 4160, 4160] + - [121, 14677.0] + - - [4096, 200, 1, 6656, 4096, 4096, 6656, 6656] + - [82, 11728.0] + - - [2048, 200, 1, 3328, 2048, 2048, 3328, 3328] + - [87, 10447.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [82, 13172.0] + - - [2048, 256, 1, 64, 2048, 2048, 64, 64] + - [80, 6631.0] + - - [2048, 256, 1, 2304, 2048, 2048, 2304, 2304] + - [136, 13354.0] + - - [4096, 200, 1, 8192, 4096, 4096, 8192, 8192] + - [136, 10782.0] + - - [1024, 512, 1, 7168, 1024, 1024, 7168, 7168] + - [110, 13771.0] + - - [1024, 512, 1, 1792, 1024, 1024, 1792, 1792] + - [110, 13199.0] + - - [4096, 200, 1, 2816, 4096, 4096, 2816, 2816] + - [131, 12116.0] + - - [1024, 1024, 1, 896, 1024, 1024, 896, 896] + - [131, 15081.0] + - - [4096, 256, 1, 5120, 4096, 4096, 5120, 5120] + - [87, 14801.0] + - - [4096, 256, 1, 2048, 4096, 4096, 2048, 2048] + - [110, 14625.0] + - - [2048, 256, 1, 5120, 2048, 2048, 5120, 5120] + - [110, 13797.0] + - - [2048, 256, 1, 7680, 2048, 2048, 7680, 7680] + - [110, 13936.0] + - - [2048, 200, 1, 3584, 2048, 2048, 3584, 3584] + - [87, 10502.0] + - - [1024, 512, 1, 1536, 1024, 1024, 1536, 1536] + - [110, 13052.0] + - - [2048, 200, 1, 64, 2048, 2048, 64, 64] + - [117, 5243.0] + - - [2048, 200, 1, 4096, 2048, 2048, 4096, 4096] + - [87, 10586.0] + - - [1024, 1024, 1, 1536, 1024, 1024, 1536, 1536] + - [110, 14560.0] + - - [4096, 256, 1, 32, 4096, 4096, 32, 32] + - [73, 6579.0] + - - [4096, 256, 1, 1280, 4096, 4096, 1280, 1280] + - [105, 14996.0] + - - [2048, 256, 1, 1024, 2048, 2048, 1024, 1024] + - [110, 12468.0] + - - [1024, 512, 1, 1152, 1024, 1024, 1152, 1152] + - [100, 12726.0] + - - [2048, 512, 1, 3328, 2048, 2048, 3328, 3328] + - [87, 14768.0] + - - [4096, 200, 1, 3584, 4096, 4096, 3584, 3584] + - [131, 12055.0] + - - [2048, 200, 1, 256, 2048, 2048, 256, 256] + - [103, 7837.0] + - - [4096, 256, 1, 1920, 4096, 4096, 1920, 1920] + - [96, 15453.0] + - - [2048, 256, 1, 1664, 2048, 2048, 1664, 1664] + - [78, 13068.0] + - - [4096, 200, 1, 5120, 4096, 4096, 5120, 5120] + - [87, 11533.0] + - - [1024, 512, 1, 8192, 1024, 1024, 8192, 8192] + - [136, 13983.0] + - - [4096, 200, 1, 896, 4096, 4096, 896, 896] + - [105, 11498.0] + - - [2048, 200, 1, 640, 2048, 2048, 640, 640] + - [103, 9588.0] + - - [4096, 200, 1, 1408, 4096, 4096, 1408, 1408] + - [131, 11876.0] + - - [2048, 200, 1, 5632, 2048, 2048, 5632, 5632] + - [87, 10716.0] + - - [1024, 512, 1, 2560, 1024, 1024, 2560, 2560] + - [136, 13486.0] + - - [4096, 200, 1, 1280, 4096, 4096, 1280, 1280] + - [105, 11703.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 2560, 2560] + - [87, 14693.0] + - - [2048, 512, 1, 64, 2048, 2048, 64, 64] + - [71, 9840.0] + - - [2048, 200, 1, 8192, 2048, 2048, 8192, 8192] + - [87, 10852.0] + - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] + - [136, 14752.0] + - - [4096, 256, 1, 640, 4096, 4096, 640, 640] + - [105, 14860.0] + - - [2048, 256, 1, 4096, 2048, 2048, 4096, 4096] + - [136, 13718.0] + - - [4096, 200, 1, 1664, 4096, 4096, 1664, 1664] + - [105, 11993.0] + - - [2048, 200, 1, 6656, 2048, 2048, 6656, 6656] + - [87, 10783.0] + - - [512, 1024, 1, 768, 512, 512, 768, 768] + - [110, 12157.0] + - - [2048, 200, 1, 8320, 2048, 2048, 8320, 8320] + - [78, 10859.0] + - - [4096, 256, 1, 3840, 4096, 4096, 3840, 3840] + - [110, 14784.0] + - - [1024, 1024, 1, 3200, 1024, 1024, 3200, 3200] + - [105, 15601.0] + - - [4096, 256, 1, 4608, 4096, 4096, 4608, 4608] + - [87, 14793.0] + - - [1024, 512, 1, 32, 1024, 1024, 32, 32] + - [115, 4766.0] + - - [1024, 512, 1, 3840, 1024, 1024, 3840, 3840] + - [136, 13711.0] + - - [2048, 512, 1, 1920, 2048, 2048, 1920, 1920] + - [121, 15477.0] + - - [4096, 200, 1, 6144, 4096, 4096, 6144, 6144] + - [82, 11593.0] + - - [2048, 200, 1, 2816, 2048, 2048, 2816, 2816] + - [87, 10356.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 3840, 3840] + - [110, 14789.0] + - - [2048, 256, 1, 3840, 2048, 2048, 3840, 3840] + - [110, 13688.0] + - - [1024, 512, 1, 7680, 1024, 1024, 7680, 7680] + - [110, 13945.0] + - - [2048, 200, 1, 10240, 2048, 2048, 10240, 10240] + - [87, 10886.0] + - - [2048, 512, 1, 5120, 2048, 2048, 5120, 5120] + - [110, 14836.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [110, 11260.0] + - - [2048, 512, 1, 32, 2048, 2048, 32, 32] + - [96, 6554.0] + - - [4096, 256, 1, 2560, 4096, 4096, 2560, 2560] + - [136, 14698.0] + - - [4096, 256, 1, 64, 4096, 4096, 64, 64] + - [73, 9811.0] + - - [2048, 200, 1, 768, 2048, 2048, 768, 768] + - [82, 9177.0] + - - [2048, 512, 1, 2560, 2048, 2048, 2560, 2560] + - [87, 14709.0] + - - [2048, 512, 1, 7168, 2048, 2048, 7168, 7168] + - [110, 14862.0] + - - [2048, 512, 1, 128, 2048, 2048, 128, 128] + - [103, 11651.0] + - - [4096, 200, 1, 2304, 4096, 4096, 2304, 2304] + - [105, 12040.0] + - - [2048, 512, 1, 4096, 2048, 2048, 4096, 4096] + - [87, 14777.0] + - - [2048, 256, 1, 2560, 2048, 2048, 2560, 2560] + - [110, 13433.0] + - - [2048, 256, 1, 4160, 2048, 2048, 4160, 4160] + - [74, 14642.0] + - - [1024, 512, 1, 1664, 1024, 1024, 1664, 1664] + - [136, 13171.0] + - - [2048, 512, 1, 2080, 2048, 2048, 2080, 2080] + - [96, 15981.0] + - - [2048, 512, 1, 3840, 2048, 2048, 3840, 3840] + - [136, 14791.0] + - - [4096, 200, 1, 3072, 4096, 4096, 3072, 3072] + - [87, 11439.0] + - - [1024, 1024, 1, 1664, 1024, 1024, 1664, 1664] + - [131, 15422.0] + - - [512, 1024, 1, 2304, 512, 512, 2304, 2304] + - [136, 13446.0] + - - [4096, 256, 1, 1408, 4096, 4096, 1408, 1408] + - [105, 15405.0] + - - [2048, 256, 1, 1152, 2048, 2048, 1152, 1152] + - [131, 12764.0] + - - [1024, 512, 1, 1280, 1024, 1024, 1280, 1280] + - [110, 12876.0] + - - [2048, 200, 1, 12288, 2048, 2048, 12288, 12288] + - [136, 10931.0] + - - [2048, 200, 1, 1664, 2048, 2048, 1664, 1664] + - [105, 10059.0] + - - [4096, 200, 1, 4608, 4096, 4096, 4608, 4608] + - [105, 11708.0] + - - [512, 1024, 1, 2560, 512, 512, 2560, 2560] + - [110, 13503.0] + - - [4096, 200, 1, 384, 4096, 4096, 384, 384] + - [130, 10900.0] + - - [2048, 200, 1, 128, 2048, 2048, 128, 128] + - [129, 6521.0] + - - [2048, 200, 1, 11264, 2048, 2048, 11264, 11264] + - [87, 10914.0] + - - [1024, 512, 1, 1920, 1024, 1024, 1920, 1920] + - [100, 13273.0] + - - [4096, 256, 1, 1536, 4096, 4096, 1536, 1536] + - [131, 15069.0] + - - [2048, 256, 1, 256, 2048, 2048, 256, 256] + - [130, 10031.0] + - - [2048, 256, 1, 10240, 2048, 2048, 10240, 10240] + - [110, 14007.0] + - - [1024, 512, 1, 5120, 1024, 1024, 5120, 5120] + - [110, 13824.0] + - - [1024, 512, 1, 8320, 1024, 1024, 8320, 8320] + - [126, 13985.0] + - - [1024, 512, 1, 10240, 1024, 1024, 10240, 10240] + - [110, 14007.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] + - [110, 14661.0] + - - [2048, 256, 1, 2080, 2048, 2048, 2080, 2080] + - [74, 14304.0] + - - [4096, 256, 1, 128, 4096, 4096, 128, 128] + - [130, 11511.0] + - - [2048, 256, 1, 896, 2048, 2048, 896, 896] + - [103, 12669.0] + - - [4096, 200, 1, 1152, 4096, 4096, 1152, 1152] + - [105, 11720.0] + - - [2048, 200, 1, 6144, 2048, 2048, 6144, 6144] + - [87, 10751.0] + - - [1024, 1024, 1, 7680, 1024, 1024, 7680, 7680] + - [110, 14845.0] + - - [2048, 200, 1, 1920, 2048, 2048, 1920, 1920] + - [78, 9998.0] + - - [4096, 256, 1, 2080, 4096, 4096, 2080, 2080] + - [121, 15911.0] + - - [2048, 200, 1, 14336, 2048, 2048, 14336, 14336] + - [136, 10872.0] + - - [1024, 512, 1, 6144, 1024, 1024, 6144, 6144] + - [136, 13698.0] + - - [1024, 512, 1, 2304, 1024, 1024, 2304, 2304] + - [110, 13362.0] + - - [4096, 200, 1, 4160, 4096, 4096, 4160, 4160] + - [74, 12486.0] + - - [4096, 200, 1, 1536, 4096, 4096, 1536, 1536] + - [105, 11839.0] + - - [2048, 320, 1, 64, 2048, 2048, 64, 64] + - [115, 7257.0] + - - [2048, 384, 1, 64, 2048, 2048, 64, 64] + - [71, 8251.0] + - - [1024, 384, 1, 289, 1024, 1024, 289, 289] + - [94, 8934.0] + - - [2048, 448, 1, 64, 2048, 2048, 64, 64] + - [71, 8764.0] + - - [102, 101, 624, 64, 102, 102, 64, 64] + - [100, 7807.0] + - - [101, 101, 624, 64, 101, 101, 64, 64] + - [110, 7722.0] + - - [85, 85, 752, 64, 85, 85, 64, 64] + - [116, 6366.0] + - - [112, 111, 576, 64, 112, 112, 64, 64] + - [75, 9111.0] + - - [65, 65, 992, 64, 65, 65, 64, 64] + - [116, 4262.0] + - - [77, 77, 816, 64, 77, 77, 64, 64] + - [71, 5785.0] + - - [111, 111, 576, 64, 111, 111, 64, 64] + - [75, 8424.0] + - - [84, 85, 752, 64, 84, 84, 64, 64] + - [71, 6409.0] + - - [84, 84, 752, 64, 84, 84, 64, 64] + - [71, 6407.0] + - - [71, 71, 896, 64, 71, 71, 64, 64] + - [116, 5017.0] + - - [122, 122, 528, 64, 122, 122, 64, 64] + - [124, 8946.0] + - - [78, 78, 816, 64, 78, 78, 64, 64] + - [116, 5877.0] + - - [112, 112, 576, 64, 112, 112, 64, 64] + - [75, 9201.0] + - - [77, 78, 816, 64, 77, 77, 64, 64] + - [93, 5743.0] + - - [111, 112, 576, 64, 111, 111, 64, 64] + - [122, 8372.0] + - - [92, 93, 688, 64, 92, 92, 64, 64] + - [78, 7042.0] + - - [102, 102, 624, 64, 102, 102, 64, 64] + - [110, 7857.0] + - - [99, 99, 624, 64, 99, 99, 64, 64] + - [78, 7690.0] + - - [100, 102, 624, 64, 100, 100, 64, 64] + - [100, 7807.0] + - - [123, 122, 528, 64, 123, 123, 64, 64] + - [98, 8934.0] + - - [99, 102, 624, 64, 99, 99, 64, 64] + - [136, 7690.0] + - - [93, 93, 688, 64, 93, 93, 64, 64] + - [78, 7003.0] + - - [123, 123, 528, 64, 123, 123, 64, 64] + - [98, 8928.0] + - - [100, 100, 624, 64, 100, 100, 64, 64] + - [87, 7581.0] + - - [101, 102, 624, 64, 101, 101, 64, 64] + - [78, 7786.0] + - - [102, 100, 624, 64, 102, 102, 64, 64] + - [87, 7789.0] + - - [92, 92, 688, 64, 92, 92, 64, 64] + - [136, 6990.0] + - - [3072, 128, 1, 4096, 3072, 3072, 4096, 4096] + - [136, 10346.0] + - - [1728, 320, 1, 64, 1728, 1728, 64, 64] + - [72, 7435.0] + - - [1440, 320, 1, 196, 1440, 1440, 196, 196] + - [73, 8752.0] + - - [2592, 384, 1, 289, 2592, 2592, 289, 289] + - [74, 13231.0] + - - [192, 80, 36, 10368, 192, 192, 10368, 10368] + - [136, 7650.0] + - - [1280, 384, 1, 64, 1280, 1280, 64, 64] + - [123, 5558.0] + - - [1280, 448, 1, 64, 1280, 1280, 64, 64] + - [129, 7168.0] + - - [3456, 256, 1, 169, 3456, 3456, 169, 169] + - [118, 11027.0] + - - [2304, 256, 1, 196, 2304, 2304, 196, 196] + - [95, 10947.0] + - - [224, 192, 36, 2592, 224, 224, 2592, 2592] + - [77, 13516.0] + - - [192, 128, 36, 1568, 192, 192, 1568, 1568] + - [131, 12948.0] + - - [1296, 288, 1, 196, 1296, 1296, 196, 196] + - [72, 7833.0] + - - [192, 64, 36, 6272, 192, 192, 6272, 6272] + - [85, 9039.0] + - - [1728, 224, 1, 1225, 1728, 1728, 1225, 1225] + - [73, 9760.0] + - - [1152, 384, 1, 64, 1152, 1152, 64, 64] + - [80, 6291.0] + - - [1792, 256, 1, 289, 1792, 1792, 289, 289] + - [73, 9998.0] + - - [1728, 384, 1, 169, 1728, 1728, 169, 169] + - [93, 9768.0] + - - [1568, 256, 1, 289, 1568, 1568, 289, 289] + - [72, 9222.0] + - - [1152, 448, 1, 64, 1152, 1152, 64, 64] + - [72, 7119.0] + - - [1536, 256, 1, 64, 1536, 1536, 64, 64] + - [70, 5694.0] + - - [1440, 320, 1, 49, 1440, 1440, 49, 49] + - [92, 5428.0] + - - [1344, 512, 1, 64, 1344, 1344, 64, 64] + - [117, 8125.0] + - - [1152, 256, 1, 196, 1152, 1152, 196, 196] + - [72, 8234.0] + - - [1728, 192, 1, 1225, 1728, 1728, 1225, 1225] + - [92, 8797.0] + - - [2048, 512, 1, 49, 2048, 2048, 49, 49] + - [73, 8130.0] + - - [512, 2048, 1, 49, 512, 512, 49, 49] + - [71, 8260.0] + - - [1728, 192, 1, 64, 1728, 1728, 64, 64] + - [92, 5389.0] + - - [1536, 384, 1, 64, 1536, 1536, 64, 64] + - [120, 7344.0] + - - [2048, 192, 1, 64, 2048, 2048, 64, 64] + - [92, 5963.0] + - - [128, 96, 36, 1568, 128, 128, 1568, 1568] + - [95, 11390.0] + - - [128, 128, 36, 3136, 128, 128, 3136, 3136] + - [121, 16132.0] + - - [1280, 320, 1, 64, 1280, 1280, 64, 64] + - [129, 5904.0] + - - [1792, 320, 1, 289, 1792, 1792, 289, 289] + - [73, 12168.0] + - - [2880, 320, 1, 64, 2880, 2880, 64, 64] + - [93, 9046.0] + - - [1728, 384, 1, 49, 1728, 1728, 49, 49] + - [94, 6636.0] + - - [512, 1024, 1, 196, 512, 512, 196, 196] + - [73, 10400.0] + - - [224, 192, 36, 5184, 224, 224, 5184, 5184] + - [125, 13672.0] + - - [192, 80, 36, 20736, 192, 192, 20736, 20736] + - [136, 7249.0] + - - [224, 192, 64, 4608, 224, 224, 4608, 4608] + - [108, 11258.0] + - - [224, 192, 64, 2304, 224, 224, 2304, 2304] + - [108, 12700.0] + - - [192, 80, 49, 14400, 192, 192, 14400, 14400] + - [96, 7623.0] + - - [224, 192, 49, 6272, 224, 224, 6272, 6272] + - [134, 12505.0] + - - [224, 192, 49, 3136, 224, 224, 3136, 3136] + - [77, 12580.0] + - - [192, 80, 36, 41472, 192, 192, 41472, 41472] + - [140, 6936.0] + - - [192, 80, 49, 28800, 192, 192, 28800, 28800] + - [87, 7347.0] + - - [192, 80, 64, 9216, 192, 192, 9216, 9216] + - [137, 5827.0] + - - [256, 224, 9, 9792, 256, 256, 9792, 9792] + - [87, 13461.0] + - - [256, 256, 9, 4896, 256, 256, 4896, 4896] + - [74, 16258.0] + - - [320, 256, 9, 4896, 320, 320, 4896, 4896] + - [93, 12296.0] + - - [224, 192, 9, 19584, 224, 224, 19584, 19584] + - [136, 10227.0] + - - [192, 192, 11, 3264, 192, 192, 3264, 3264] + - [72, 10924.0] + - - [192, 192, 11, 6528, 192, 192, 6528, 6528] + - [122, 10028.0] + - - [192, 192, 9, 4896, 192, 192, 4896, 4896] + - [74, 9207.0] + - - [224, 192, 11, 6528, 224, 224, 6528, 6528] + - [125, 11503.0] + - - [192, 192, 9, 19584, 192, 192, 19584, 19584] + - [110, 8777.0] + - - [256, 224, 11, 13056, 256, 256, 13056, 13056] + - [85, 10337.0] + - - [224, 192, 11, 13056, 224, 224, 13056, 13056] + - [108, 10539.0] + - - [256, 256, 11, 3264, 256, 256, 3264, 3264] + - [118, 13525.0] + - - [320, 256, 11, 6528, 320, 320, 6528, 6528] + - [136, 12672.0] + - - [192, 192, 9, 9792, 192, 192, 9792, 9792] + - [129, 9079.0] + - - [224, 224, 9, 9792, 224, 224, 9792, 9792] + - [87, 11852.0] + - - [224, 192, 11, 3264, 224, 224, 3264, 3264] + - [95, 12397.0] + - - [224, 224, 11, 6528, 224, 224, 6528, 6528] + - [85, 9550.0] + - - [224, 224, 9, 19584, 224, 224, 19584, 19584] + - [78, 11693.0] + - - [192, 192, 11, 13056, 192, 192, 13056, 13056] + - [133, 9598.0] + - - [224, 224, 9, 4896, 224, 224, 4896, 4896] + - [74, 12366.0] + - - [320, 256, 11, 3264, 320, 320, 3264, 3264] + - [121, 13731.0] + - - [256, 256, 11, 6528, 256, 256, 6528, 6528] + - [77, 12283.0] + - - [224, 192, 9, 4896, 224, 224, 4896, 4896] + - [74, 10611.0] + - - [224, 224, 11, 13056, 224, 224, 13056, 13056] + - [134, 9295.0] + - - [224, 224, 11, 3264, 224, 224, 3264, 3264] + - [95, 10348.0] + - - [256, 224, 11, 6528, 256, 256, 6528, 6528] + - [99, 10858.0] + - - [256, 224, 11, 3264, 256, 256, 3264, 3264] + - [95, 11822.0] + - - [224, 192, 9, 9792, 224, 224, 9792, 9792] + - [100, 10173.0] + - - [256, 224, 9, 4896, 256, 256, 4896, 4896] + - [74, 13976.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [76, 7685.0] + - - [135, 135, 32, 64, 135, 135, 64, 64] + - [93, 5071.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [72, 5737.0] + - - [65, 65, 472, 64, 65, 65, 64, 64] + - [93, 4006.0] + - - [65, 65, 496, 64, 65, 65, 64, 64] + - [71, 3973.0] + - - [70, 70, 216, 64, 70, 70, 64, 64] + - [91, 3980.0] + - - [70, 71, 216, 64, 70, 70, 64, 64] + - [93, 4119.0] + - - [71, 71, 216, 64, 71, 71, 64, 64] + - [70, 4066.0] + - - [71, 71, 448, 64, 71, 71, 64, 64] + - [116, 4726.0] + - - [77, 77, 248, 64, 77, 77, 64, 64] + - [93, 4942.0] + - - [77, 77, 408, 64, 77, 77, 64, 64] + - [116, 5309.0] + - - [77, 78, 248, 64, 77, 77, 64, 64] + - [71, 4884.0] + - - [77, 78, 408, 64, 77, 77, 64, 64] + - [116, 5393.0] + - - [78, 78, 248, 64, 78, 78, 64, 64] + - [116, 4957.0] + - - [78, 78, 408, 64, 78, 78, 64, 64] + - [93, 5456.0] + - - [80, 80, 152, 64, 80, 80, 64, 64] + - [115, 4760.0] + - - [80, 84, 152, 64, 80, 80, 64, 64] + - [116, 4967.0] + - - [84, 84, 152, 64, 84, 84, 64, 64] + - [93, 4903.0] + - - [85, 85, 376, 64, 85, 85, 64, 64] + - [116, 6012.0] + - - [93, 93, 344, 64, 93, 93, 64, 64] + - [116, 6429.0] + - - [102, 102, 312, 64, 102, 102, 64, 64] + - [100, 6861.0] + - - [112, 112, 288, 64, 112, 112, 64, 64] + - [122, 8164.0] + - - [122, 122, 264, 64, 122, 122, 64, 64] + - [98, 8035.0] + - - [123, 122, 264, 64, 123, 123, 64, 64] + - [124, 8189.0] + - - [123, 123, 264, 64, 123, 123, 64, 64] + - [124, 8193.0] + - - [511, 2048, 1, 2048, 511, 511, 2048, 2048] + - [110, 14454.0] + - - [1024, 512, 1, 1025, 1024, 1024, 1025, 1025] + - [73, 13415.0] + - - [512, 1023, 1, 1024, 512, 512, 1024, 1024] + - [87, 12234.0] + - - [1025, 1024, 1, 1024, 1025, 1025, 1024, 1024] + - [110, 14277.0] + - - [2048, 513, 1, 2048, 2048, 2048, 2048, 2048] + - [108, 13644.0] + - - [1024, 1024, 1, 1025, 1024, 1024, 1025, 1025] + - [96, 15460.0] + - - [960, 1024, 1, 1023, 960, 960, 1023, 1023] + - [96, 14453.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [105, 14363.0] + - - [960, 1025, 1, 1024, 960, 960, 1024, 1024] + - [82, 13424.0] + - - [2049, 512, 1, 2048, 2049, 2049, 2048, 2048] + - [87, 14470.0] + - - [513, 1024, 1, 1024, 513, 513, 1024, 1024] + - [106, 10898.0] + - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] + - [82, 14790.0] + - - [1024, 511, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 11902.0] + - - [1024, 512, 1, 1023, 1024, 1024, 1023, 1023] + - [73, 13429.0] + - - [960, 1024, 1, 1025, 960, 960, 1025, 1025] + - [74, 14452.0] + - - [959, 1024, 1, 1024, 959, 959, 1024, 1024] + - [82, 13418.0] + - - [2048, 512, 1, 2049, 2048, 2048, 2049, 2049] + - [96, 15859.0] + - - [511, 1024, 1, 1024, 511, 511, 1024, 1024] + - [110, 12172.0] + - - [512, 2049, 1, 2048, 512, 512, 2048, 2048] + - [87, 14580.0] + - - [1024, 513, 1, 1024, 1024, 1024, 1024, 1024] + - [85, 11563.0] + - - [2048, 512, 1, 2047, 2048, 2048, 2047, 2047] + - [121, 15848.0] + - - [1025, 512, 1, 1024, 1025, 1025, 1024, 1024] + - [87, 12331.0] + - - [1024, 1024, 1, 1023, 1024, 1024, 1023, 1023] + - [96, 15528.0] + - - [513, 2048, 1, 2048, 513, 513, 2048, 2048] + - [106, 12278.0] + - - [1024, 1025, 1, 1024, 1024, 1024, 1024, 1024] + - [82, 14485.0] + - - [512, 2048, 1, 2049, 512, 512, 2049, 2049] + - [96, 15833.0] + - - [1024, 1023, 1, 1024, 1024, 1024, 1024, 1024] + - [105, 14178.0] + - - [960, 1023, 1, 1024, 960, 960, 1024, 1024] + - [105, 13264.0] + - - [2048, 511, 1, 2048, 2048, 2048, 2048, 2048] + - [105, 14386.0] + - - [1023, 512, 1, 1024, 1023, 1023, 1024, 1024] + - [136, 12190.0] + - - [2047, 512, 1, 2048, 2047, 2047, 2048, 2048] + - [131, 14596.0] + - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] + - [136, 12573.0] + - - [512, 1024, 1, 1025, 512, 512, 1025, 1025] + - [73, 13469.0] + - - [512, 2047, 1, 2048, 512, 512, 2048, 2048] + - [87, 14509.0] + - - [512, 1025, 1, 1024, 512, 512, 1024, 1024] + - [87, 12326.0] + - - [512, 2048, 1, 2047, 512, 512, 2047, 2047] + - [121, 15855.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [82, 13797.0] + - - [961, 1024, 1, 1024, 961, 961, 1024, 1024] + - [105, 13603.0] + - - [512, 1024, 1, 1023, 512, 512, 1023, 1023] + - [73, 13429.0] + - - [1023, 1024, 1, 1024, 1023, 1023, 1024, 1024] + - [105, 14345.0] + - - [479, 1024, 1, 1024, 479, 479, 1024, 1024] + - [136, 11452.0] + - - [479, 2048, 1, 2048, 479, 479, 2048, 2048] + - [105, 13626.0] + - - [480, 1023, 1, 1024, 480, 480, 1024, 1024] + - [110, 11402.0] + - - [480, 1024, 1, 1023, 480, 480, 1023, 1023] + - [73, 12422.0] + - - [480, 1024, 1, 1025, 480, 480, 1025, 1025] + - [73, 12495.0] + - - [480, 1025, 1, 1024, 480, 480, 1024, 1024] + - [110, 11497.0] + - - [480, 2047, 1, 2048, 480, 480, 2048, 2048] + - [82, 14013.0] + - - [480, 2048, 1, 2047, 480, 480, 2047, 2047] + - [121, 14811.0] + - - [480, 2048, 1, 2049, 480, 480, 2049, 2049] + - [96, 14811.0] + - - [480, 2049, 1, 2048, 480, 480, 2048, 2048] + - [136, 13660.0] + - - [480, 3071, 1, 3072, 480, 480, 3072, 3072] + - [87, 14261.0] + - - [481, 1024, 1, 1024, 481, 481, 1024, 1024] + - [110, 11499.0] + - - [481, 2048, 1, 2048, 481, 481, 2048, 2048] + - [136, 13644.0] + - - [1023, 480, 1, 1024, 1023, 1023, 1024, 1024] + - [110, 11264.0] + - - [1024, 479, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 11302.0] + - - [1024, 480, 1, 1023, 1024, 1024, 1023, 1023] + - [73, 12409.0] + - - [1024, 480, 1, 1025, 1024, 1024, 1025, 1025] + - [73, 12397.0] + - - [1024, 481, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 11263.0] + - - [1025, 480, 1, 1024, 1025, 1025, 1024, 1024] + - [87, 11368.0] + - - [2047, 480, 1, 2048, 2047, 2047, 2048, 2048] + - [105, 13807.0] + - - [2048, 479, 1, 2048, 2048, 2048, 2048, 2048] + - [131, 13795.0] + - - [2048, 480, 1, 2047, 2048, 2048, 2047, 2047] + - [121, 14656.0] + - - [2048, 480, 1, 2049, 2048, 2048, 2049, 2049] + - [96, 14655.0] + - - [2048, 481, 1, 2048, 2048, 2048, 2048, 2048] + - [105, 13898.0] + - - [2049, 480, 1, 2048, 2049, 2049, 2048, 2048] + - [136, 13513.0] + - - [3071, 480, 1, 3072, 3071, 3071, 3072, 3072] + - [110, 14172.0] + - - [480, 1024, 1, 1024, 480, 480, 1024, 1024] + - [110, 11523.0] + - - [480, 2048, 1, 2048, 480, 480, 2048, 2048] + - [105, 13993.0] + - - [1024, 480, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 11275.0] + - - [2048, 480, 1, 2048, 2048, 2048, 2048, 2048] + - [136, 13605.0] + - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] + - [110, 13335.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [82, 13718.0] + - - [1024, 960, 1, 1600, 1024, 1024, 1600, 1600] + - [121, 14747.0] + - - [1024, 1024, 1, 960, 1024, 1024, 960, 960] + - [96, 15573.0] + - - [2048, 215, 1, 512, 2048, 2048, 512, 512] + - [103, 9255.0] + - - [2048, 215, 1, 768, 2048, 2048, 768, 768] + - [134, 9558.0] + - - [2048, 256, 1, 512, 2048, 2048, 512, 512] + - [131, 11403.0] + - - [2048, 256, 1, 768, 2048, 2048, 768, 768] + - [105, 12165.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [136, 14655.0] + - - [2048, 512, 1, 67, 2048, 2048, 67, 67] + - [71, 9418.0] + - - [2048, 512, 1, 74, 2048, 2048, 74, 74] + - [71, 9974.0] + - - [256, 1280, 1, 1024, 256, 256, 1024, 1024] + - [102, 8461.0] + - - [256, 1536, 1, 1024, 256, 256, 1024, 1024] + - [102, 10006.0] + - - [256, 2304, 1, 1024, 256, 256, 1024, 1024] + - [110, 14040.0] + - - [256, 2560, 1, 1024, 256, 256, 1024, 1024] + - [130, 10969.0] + - - [256, 2816, 1, 1024, 256, 256, 1024, 1024] + - [85, 12031.0] + - - [256, 3328, 1, 1024, 256, 256, 1024, 1024] + - [85, 14089.0] + - - [256, 3584, 1, 1024, 256, 256, 1024, 1024] + - [82, 13071.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [85, 13042.0] + - - [767, 1280, 1, 768, 767, 767, 768, 768] + - [82, 13200.0] + - - [769, 1280, 1, 768, 769, 769, 768, 768] + - [131, 13790.0] + - - [768, 1279, 1, 768, 768, 768, 768, 768] + - [82, 13277.0] + - - [768, 1281, 1, 768, 768, 768, 768, 768] + - [105, 13570.0] + - - [768, 1280, 1, 767, 768, 768, 767, 767] + - [96, 14324.0] + - - [768, 1280, 1, 769, 768, 768, 769, 769] + - [96, 14312.0] + - - [256, 4096, 1, 512, 256, 256, 512, 512] + - [82, 13945.0] + - - [767, 768, 1, 768, 767, 767, 768, 768] + - [110, 12992.0] + - - [769, 768, 1, 768, 769, 769, 768, 768] + - [130, 9651.0] + - - [768, 767, 1, 768, 768, 768, 768, 768] + - [87, 12758.0] + - - [768, 769, 1, 768, 768, 768, 768, 768] + - [134, 9589.0] + - - [768, 768, 1, 767, 768, 768, 767, 767] + - [118, 14509.0] + - - [768, 768, 1, 769, 768, 768, 769, 769] + - [118, 14519.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [136, 13562.0] + - - [128, 128, 49, 1152, 128, 128, 1152, 1152] + - [85, 13369.0] + - - [128, 128, 49, 1216, 128, 128, 1216, 1216] + - [95, 14588.0] + - - [128, 128, 36, 1800, 128, 128, 1800, 1800] + - [74, 15855.0] + - - [128, 128, 36, 1900, 128, 128, 1900, 1900] + - [96, 15722.0] + - - [128, 128, 64, 5880, 128, 128, 5880, 5880] + - [131, 12181.0] + - - [128, 128, 49, 7680, 128, 128, 7680, 7680] + - [113, 8896.0] + - - [128, 128, 64, 882, 128, 128, 882, 882] + - [74, 14975.0] + - - [128, 128, 64, 931, 128, 128, 931, 931] + - [96, 15273.0] + - - [128, 64, 121, 1152, 128, 128, 1152, 1152] + - [85, 12756.0] + - - [128, 64, 81, 12000, 128, 128, 12000, 12000] + - [72, 7613.0] + - - [128, 64, 121, 1216, 128, 128, 1216, 1216] + - [125, 12504.0] + - - [128, 64, 81, 1800, 128, 128, 1800, 1800] + - [134, 11158.0] + - - [128, 64, 81, 1900, 128, 128, 1900, 1900] + - [95, 12276.0] + - - [128, 64, 49, 20280, 128, 128, 20280, 20280] + - [117, 8304.0] + - - [128, 64, 49, 3042, 128, 128, 3042, 3042] + - [117, 10681.0] + - - [128, 64, 49, 3211, 128, 128, 3211, 3211] + - [118, 10777.0] + - - [128, 64, 169, 5880, 128, 128, 5880, 5880] + - [130, 8133.0] + - - [128, 64, 121, 7680, 128, 128, 7680, 7680] + - [138, 7136.0] + - - [128, 64, 169, 882, 128, 128, 882, 882] + - [98, 10155.0] + - - [128, 64, 169, 931, 128, 128, 931, 931] + - [127, 10251.0] + - - [256, 128, 25, 1080, 256, 256, 1080, 1080] + - [118, 14785.0] + - - [256, 128, 25, 162, 256, 256, 162, 162] + - [73, 11115.0] + - - [256, 128, 25, 171, 256, 256, 171, 171] + - [118, 11171.0] + - - [1152, 256, 1, 1, 1152, 1152, 1, 1] + - [69, 166.0] + - - [1152, 256, 1, 1444, 1152, 1152, 1444, 1444] + - [72, 11466.0] + - - [1152, 256, 1, 25, 1152, 1152, 25, 25] + - [69, 2903.0] + - - [1152, 256, 1, 9, 1152, 1152, 9, 9] + - [69, 1252.0] + - - [2304, 256, 1, 1444, 2304, 2304, 1444, 1444] + - [96, 15480.0] + - - [2304, 340, 1, 1, 2304, 2304, 1, 1] + - [128, 280.0] + - - [2304, 340, 1, 1444, 2304, 2304, 1444, 1444] + - [121, 11635.0] + - - [2304, 340, 1, 9, 2304, 2304, 9, 9] + - [114, 2136.0] + - - [2304, 510, 1, 25, 2304, 2304, 25, 25] + - [114, 5190.0] + - - [30522, 77, 1, 1024, 30522, 30522, 1024, 1024] + - [105, 10059.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 12937.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 13319.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 13635.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 9298.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 10960.0] + - - [64, 512, 256, 512, 64, 64, 512, 512] + - [88, 9101.0] + - - [64, 512, 128, 512, 64, 64, 512, 512] + - [135, 10313.0] + - - [64, 512, 40, 512, 64, 64, 512, 512] + - [135, 10174.0] + - - [96, 1024, 64, 1024, 96, 96, 1024, 1024] + - [85, 10315.0] + - - [96, 1024, 128, 1024, 96, 96, 1024, 1024] + - [108, 10529.0] + - - [64, 1024, 256, 1024, 64, 64, 1024, 1024] + - [111, 9349.0] + - - [64, 1024, 32, 1024, 64, 64, 1024, 1024] + - [86, 10015.0] + - - [64, 1024, 64, 1024, 64, 64, 1024, 1024] + - [88, 8976.0] + - - [64, 1024, 128, 1024, 64, 64, 1024, 1024] + - [137, 9336.0] + - - [64, 128, 1024, 128, 64, 64, 128, 128] + - [135, 11307.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [82, 12038.0] + - - [1024, 864, 1, 480, 1024, 1024, 480, 480] + - [96, 12175.0] + - - [128, 3456, 1, 256, 128, 128, 256, 256] + - [105, 8401.0] + - - [128, 4096, 1, 256, 128, 128, 256, 256] + - [85, 9684.0] + - - [128, 6912, 1, 256, 128, 128, 256, 256] + - [82, 11474.0] + - - [256, 3456, 1, 512, 256, 256, 512, 512] + - [82, 12080.0] + - - [512, 864, 1, 1024, 512, 512, 1024, 1024] + - [136, 10394.0] + - - [512, 864, 1, 13, 512, 512, 13, 13] + - [114, 2162.0] + - - [64, 128, 1280, 128, 64, 64, 128, 128] + - [137, 6718.0] + - - [64, 128, 1312, 128, 64, 64, 128, 128] + - [89, 6508.0] + - - [64, 512, 192, 512, 64, 64, 512, 512] + - [137, 9082.0] + - - [1024, 512, 1, 196, 1024, 1024, 196, 196] + - [73, 9126.0] + - - [64, 128, 2048, 128, 64, 64, 128, 128] + - [112, 5891.0] + - - [64, 128, 1536, 128, 64, 64, 128, 128] + - [139, 5940.0] + - - [128, 128, 64, 6400, 128, 128, 6400, 6400] + - [113, 11551.0] + - - [64, 128, 192, 128, 64, 64, 128, 128] + - [109, 9160.0] + - - [64, 384, 144, 384, 64, 64, 384, 384] + - [132, 12794.0] + - - [64, 512, 48, 512, 64, 64, 512, 512] + - [86, 10274.0] + - - [64, 128, 256, 128, 64, 64, 128, 128] + - [107, 9957.0] + - - [64, 384, 192, 384, 64, 64, 384, 384] + - [86, 11012.0] + - - [128, 128, 49, 1120, 128, 128, 1120, 1120] + - [95, 14313.0] + - - [128, 128, 49, 1064, 128, 128, 1064, 1064] + - [95, 14617.0] + - - [128, 128, 49, 1040, 128, 128, 1040, 1040] + - [73, 14602.0] + - - [128, 128, 64, 600, 128, 128, 600, 600] + - [121, 15095.0] + - - [128, 128, 64, 616, 128, 128, 616, 616] + - [74, 15085.0] + - - [128, 128, 49, 950, 128, 128, 950, 950] + - [95, 14412.0] + - - [128, 128, 49, 972, 128, 128, 972, 972] + - [118, 14445.0] + - - [128, 128, 64, 560, 128, 128, 560, 560] + - [96, 14980.0] + - - [128, 128, 49, 1008, 128, 128, 1008, 1008] + - [95, 14586.0] + - - [128, 128, 64, 532, 128, 128, 532, 532] + - [74, 14734.0] + - - [128, 128, 49, 1080, 128, 128, 1080, 1080] + - [95, 14626.0] + - - [128, 128, 64, 588, 128, 128, 588, 588] + - [121, 14864.0] + - - [128, 128, 49, 1160, 128, 128, 1160, 1160] + - [95, 14652.0] + - - [128, 128, 49, 988, 128, 128, 988, 988] + - [118, 14458.0] + - - [128, 128, 49, 936, 128, 128, 936, 936] + - [73, 14518.0] + - - [512, 1024, 1, 3800, 512, 512, 3800, 3800] + - [74, 14621.0] + - - [512, 1024, 1, 3400, 512, 512, 3400, 3400] + - [121, 14578.0] + - - [512, 1024, 1, 3456, 512, 512, 3456, 3456] + - [126, 13677.0] + - - [2048, 512, 1, 950, 2048, 2048, 950, 950] + - [96, 15521.0] + - - [512, 1024, 1, 3552, 512, 512, 3552, 3552] + - [96, 14608.0] + - - [512, 1024, 1, 3220, 512, 512, 3220, 3220] + - [96, 14479.0] + - - [2048, 512, 1, 850, 2048, 2048, 850, 850] + - [96, 15410.0] + - - [512, 2048, 1, 864, 512, 512, 864, 864] + - [121, 15577.0] + - - [512, 2048, 1, 768, 512, 512, 768, 768] + - [136, 14258.0] + - - [2048, 512, 1, 805, 2048, 2048, 805, 805] + - [96, 15381.0] + - - [512, 1024, 1, 2852, 512, 512, 2852, 2852] + - [121, 14427.0] + - - [512, 2048, 1, 888, 512, 512, 888, 888] + - [121, 15628.0] + - - [2048, 512, 1, 864, 2048, 2048, 864, 864] + - [96, 15566.0] + - - [2048, 512, 1, 888, 2048, 2048, 888, 888] + - [96, 15581.0] + - - [2048, 256, 1, 950, 2048, 2048, 950, 950] + - [73, 13310.0] + - - [2048, 512, 1, 713, 2048, 2048, 713, 713] + - [96, 15245.0] + - - [512, 1024, 1, 2688, 512, 512, 2688, 2688] + - [126, 13540.0] + - - [512, 1024, 1, 2640, 512, 512, 2640, 2640] + - [121, 14490.0] + - - [512, 1024, 1, 2904, 512, 512, 2904, 2904] + - [96, 14511.0] + - - [1024, 512, 1, 950, 1024, 1024, 950, 950] + - [96, 13432.0] + - - [512, 2048, 1, 672, 512, 512, 672, 672] + - [96, 15439.0] + - - [512, 2048, 1, 660, 512, 512, 660, 660] + - [96, 15271.0] + - - [512, 2048, 1, 1008, 512, 512, 1008, 1008] + - [96, 15682.0] + - - [2048, 256, 1, 850, 2048, 2048, 850, 850] + - [73, 13271.0] + - - [2048, 512, 1, 726, 2048, 2048, 726, 726] + - [121, 15299.0] + - - [1024, 512, 1, 850, 1024, 1024, 850, 850] + - [73, 13335.0] + - - [2048, 512, 1, 660, 2048, 2048, 660, 660] + - [121, 15170.0] + - - [2048, 512, 1, 672, 2048, 2048, 672, 672] + - [96, 15405.0] + - - [512, 2048, 1, 840, 512, 512, 840, 840] + - [121, 15529.0] + - - [2048, 512, 1, 1008, 2048, 2048, 1008, 1008] + - [96, 15677.0] + - - [512, 2048, 1, 792, 512, 512, 792, 792] + - [121, 15534.0] + - - [1024, 512, 1, 805, 1024, 1024, 805, 805] + - [73, 13214.0] + - - [512, 2048, 1, 1050, 512, 512, 1050, 1050] + - [121, 15613.0] + - - [2048, 512, 1, 748, 2048, 2048, 748, 748] + - [96, 15373.0] + - - [2048, 256, 1, 864, 2048, 2048, 864, 864] + - [74, 13394.0] + - - [1024, 512, 1, 864, 1024, 1024, 864, 864] + - [96, 13546.0] + - - [2048, 512, 1, 875, 2048, 2048, 875, 875] + - [96, 15446.0] + - - [2048, 512, 1, 840, 2048, 2048, 840, 840] + - [96, 15507.0] + - - [2048, 512, 1, 792, 2048, 2048, 792, 792] + - [96, 15494.0] + - - [512, 2048, 1, 736, 512, 512, 736, 736] + - [96, 15478.0] + - - [2048, 256, 1, 888, 2048, 2048, 888, 888] + - [73, 13401.0] + - - [512, 2048, 1, 704, 512, 512, 704, 704] + - [121, 15405.0] + - - [512, 2048, 1, 588, 512, 512, 588, 588] + - [96, 15149.0] + - - [1024, 512, 1, 888, 1024, 1024, 888, 888] + - [121, 13526.0] + - - [512, 2048, 1, 816, 512, 512, 816, 816] + - [121, 15546.0] + - - [1024, 512, 1, 713, 1024, 1024, 713, 713] + - [73, 13089.0] + - - [2048, 512, 1, 736, 2048, 2048, 736, 736] + - [121, 15417.0] + - - [2048, 512, 1, 588, 2048, 2048, 588, 588] + - [96, 15126.0] + - - [2048, 512, 1, 704, 2048, 2048, 704, 704] + - [96, 15360.0] + - - [1024, 512, 1, 660, 1024, 1024, 660, 660] + - [73, 12999.0] + - - [2048, 256, 1, 660, 2048, 2048, 660, 660] + - [73, 12931.0] + - - [2048, 256, 1, 672, 2048, 2048, 672, 672] + - [73, 13088.0] + - - [1024, 512, 1, 672, 1024, 1024, 672, 672] + - [73, 13117.0] + - - [1024, 512, 1, 726, 1024, 1024, 726, 726] + - [73, 13080.0] + - - [512, 2048, 1, 630, 512, 512, 630, 630] + - [96, 15214.0] + - - [512, 2048, 1, 600, 512, 512, 600, 600] + - [121, 15323.0] + - - [2048, 256, 1, 805, 2048, 2048, 805, 805] + - [73, 13164.0] + - - [2048, 256, 1, 713, 2048, 2048, 713, 713] + - [73, 13052.0] + - - [2048, 256, 1, 726, 2048, 2048, 726, 726] + - [73, 13018.0] + - - [320, 1024, 1, 1024, 320, 320, 1024, 1024] + - [129, 8093.0] + - - [1024, 1000, 1, 1024, 1024, 1024, 1024, 1024] + - [105, 14030.0] + - - [320, 1000, 1, 1024, 320, 320, 1024, 1024] + - [102, 8051.0] + - - [128, 128, 49, 1280, 128, 128, 1280, 1280] + - [85, 13294.0] + - - [128, 128, 49, 1360, 128, 128, 1360, 1360] + - [95, 14738.0] + - - [128, 128, 49, 1200, 128, 128, 1200, 1200] + - [118, 14681.0] + - - [128, 128, 49, 1240, 128, 128, 1240, 1240] + - [95, 14683.0] + - - [2304, 256, 1, 704, 2304, 2304, 704, 704] + - [74, 14559.0] + - - [2304, 256, 1, 736, 2304, 2304, 736, 736] + - [73, 14666.0] + - - [2304, 256, 1, 792, 2304, 2304, 792, 792] + - [74, 14792.0] + - - [2304, 256, 1, 748, 2304, 2304, 748, 748] + - [95, 14513.0] + - - [2304, 256, 1, 726, 2304, 2304, 726, 726] + - [73, 14476.0] + - - [2304, 256, 1, 713, 2304, 2304, 713, 713] + - [73, 14432.0] + - - [2304, 256, 1, 768, 2304, 2304, 768, 768] + - [87, 13386.0] + - - [512, 2048, 1, 759, 512, 512, 759, 759] + - [121, 15370.0] + - - [512, 2048, 1, 925, 512, 512, 925, 925] + - [96, 15514.0] + - - [2304, 256, 1, 805, 2304, 2304, 805, 805] + - [73, 14654.0] + - - [512, 2048, 1, 900, 512, 512, 900, 900] + - [121, 15506.0] + - - [512, 2048, 1, 875, 512, 512, 875, 875] + - [96, 15509.0] + - - [512, 2048, 1, 748, 512, 512, 748, 748] + - [96, 15385.0] + - - [512, 2048, 1, 726, 512, 512, 726, 726] + - [121, 15299.0] + - - [512, 2048, 1, 713, 512, 512, 713, 713] + - [96, 15308.0] + - - [512, 2048, 1, 805, 512, 512, 805, 805] + - [96, 15415.0] + - - [512, 2048, 1, 850, 512, 512, 850, 850] + - [121, 15415.0] + - - [512, 2048, 1, 950, 512, 512, 950, 950] + - [96, 15526.0] + - - [96, 1024, 160, 1024, 96, 96, 1024, 1024] + - [85, 10609.0] + - - [96, 1024, 40, 1024, 96, 96, 1024, 1024] + - [85, 11024.0] + - - [96, 1024, 80, 1024, 96, 96, 1024, 1024] + - [85, 10318.0] + - - [96, 1024, 96, 1024, 96, 96, 1024, 1024] + - [87, 10564.0] + - - [96, 1024, 24, 1024, 96, 96, 1024, 1024] + - [87, 11412.0] + - - [96, 1024, 48, 1024, 96, 96, 1024, 1024] + - [134, 10418.0] + - - [96, 1024, 16, 1024, 96, 96, 1024, 1024] + - [110, 11230.0] + - - [96, 1024, 32, 1024, 96, 96, 1024, 1024] + - [136, 11261.0] + - - [64, 512, 320, 512, 64, 64, 512, 512] + - [88, 9201.0] + - - [64, 512, 80, 512, 64, 64, 512, 512] + - [86, 11479.0] + - - [29000, 109, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 12021.0] + - - [29000, 121, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 13795.0] + - - [29000, 65, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 7705.0] + - - [29000, 66, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 7883.0] + - - [29000, 67, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 7896.0] + - - [29000, 69, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 8207.0] + - - [29000, 70, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 8364.0] + - - [29000, 71, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 8430.0] + - - [29000, 73, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 8678.0] + - - [29000, 74, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 8667.0] + - - [29000, 75, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 8726.0] + - - [29000, 77, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 9129.0] + - - [29000, 78, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 9207.0] + - - [29000, 80, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 9351.0] + - - [29000, 81, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 9610.0] + - - [29000, 82, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 9511.0] + - - [29000, 83, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 9842.0] + - - [29000, 84, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 9845.0] + - - [29000, 88, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 10310.0] + - - [29000, 89, 1, 2560, 29000, 29000, 2560, 2560] + - [110, 10442.0] + - - [29000, 90, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 10440.0] + - - [29000, 92, 1, 2560, 29000, 29000, 2560, 2560] + - [87, 10841.0] + - - [29000, 95, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 10792.0] + - - [29000, 98, 1, 2560, 29000, 29000, 2560, 2560] + - [136, 11061.0] + - - [64, 1024, 512, 1024, 64, 64, 1024, 1024] + - [111, 9477.0] + - - [1024, 200, 1, 13312, 1024, 1024, 13312, 13312] + - [147, 10643.0] + - - [1024, 256, 1, 15360, 1024, 1024, 15360, 15360] + - [147, 13638.0] + - - [1024, 256, 1, 16384, 1024, 1024, 16384, 16384] + - [148, 12168.0] + - - [1024, 200, 1, 16384, 1024, 1024, 16384, 16384] + - [147, 9927.0] + - - [1024, 256, 1, 12288, 1024, 1024, 12288, 12288] + - [147, 13355.0] + - - [1024, 200, 1, 12288, 1024, 1024, 12288, 12288] + - [147, 10522.0] + - - [1024, 200, 1, 15360, 1024, 1024, 15360, 15360] + - [147, 10826.0] + - - [1024, 256, 1, 9216, 1024, 1024, 9216, 9216] + - [147, 13020.0] + - - [1024, 200, 1, 14336, 1024, 1024, 14336, 14336] + - [147, 10756.0] + - - [1024, 256, 1, 16640, 1024, 1024, 16640, 16640] + - [156, 13813.0] + - - [1024, 200, 1, 8192, 1024, 1024, 8192, 8192] + - [147, 10005.0] + - - [1024, 200, 1, 10240, 1024, 1024, 10240, 10240] + - [147, 10375.0] + - - [1024, 200, 1, 9216, 1024, 1024, 9216, 9216] + - [147, 10326.0] + - - [1024, 256, 1, 11264, 1024, 1024, 11264, 11264] + - [147, 13412.0] + - - [1024, 200, 1, 8320, 1024, 1024, 8320, 8320] + - [147, 10192.0] + - - [1024, 256, 1, 8320, 1024, 1024, 8320, 8320] + - [162, 12857.0] + - - [1024, 200, 1, 16640, 1024, 1024, 16640, 16640] + - [162, 10938.0] + - - [1024, 256, 1, 14336, 1024, 1024, 14336, 14336] + - [147, 13574.0] + - - [1024, 256, 1, 13312, 1024, 1024, 13312, 13312] + - [147, 13462.0] + - - [1024, 200, 1, 11264, 1024, 1024, 11264, 11264] + - [147, 10551.0] + - - [1024, 256, 1, 8192, 1024, 1024, 8192, 8192] + - [147, 12042.0] + - - [1024, 256, 1, 10240, 1024, 1024, 10240, 10240] + - [147, 13089.0] + - - [96, 64, 64, 18432, 96, 96, 18432, 18432] + - [138, 6262.0] + - - [96, 64, 36, 10368, 96, 96, 10368, 10368] + - [84, 7266.0] + - - [96, 64, 36, 20736, 96, 96, 20736, 20736] + - [154, 7250.0] + - - [96, 96, 36, 10368, 96, 96, 10368, 10368] + - [87, 7967.0] + - - [96, 64, 49, 28800, 96, 96, 28800, 28800] + - [168, 7118.0] + - - [96, 64, 36, 41472, 96, 96, 41472, 41472] + - [168, 7188.0] + - - [64, 64, 11, 233600, 64, 64, 233600, 233600] + - [167, 5423.0] + - - [64, 64, 11, 116800, 64, 64, 116800, 116800] + - [153, 5606.0] + - - [64, 64, 9, 172864, 64, 64, 172864, 172864] + - [143, 6425.0] + - - [64, 64, 11, 58400, 64, 64, 58400, 58400] + - [164, 6140.0] + - - [192, 160, 9, 19584, 192, 192, 19584, 19584] + - [160, 8667.0] + - - [128, 128, 9, 9792, 128, 128, 9792, 9792] + - [149, 12771.0] + - - [192, 160, 11, 13056, 192, 192, 13056, 13056] + - [154, 8492.0] + - - [64, 64, 9, 86432, 64, 64, 86432, 86432] + - [151, 6229.0] + - - [128, 128, 9, 19584, 128, 128, 19584, 19584] + - [148, 11710.0] + - - [160, 160, 11, 13056, 160, 160, 13056, 13056] + - [165, 7364.0] + - - [160, 160, 9, 19584, 160, 160, 19584, 19584] + - [150, 7991.0] + - - [192, 128, 9, 19584, 192, 192, 19584, 19584] + - [162, 10825.0] + - - [192, 160, 9, 9792, 192, 192, 9792, 9792] + - [158, 10327.0] + - - [64, 64, 9, 345728, 64, 64, 345728, 345728] + - [157, 6334.0] + - - [128, 128, 11, 13056, 128, 128, 13056, 13056] + - [146, 10435.0] + - - [160, 160, 9, 9792, 160, 160, 9792, 9792] + - [158, 8727.0] + - - [192, 128, 11, 13056, 192, 192, 13056, 13056] + - [155, 9716.0] + - - [192, 128, 9, 9792, 192, 192, 9792, 9792] + - [159, 11936.0] + - - [128, 64, 25, 43320, 128, 128, 43320, 43320] + - [161, 8160.0] + - - [64, 64, 64, 20280, 64, 64, 20280, 20280] + - [119, 6295.0] + - - [64, 64, 49, 27000, 64, 64, 27000, 27000] + - [97, 6188.0] + - - [64, 64, 36, 43320, 64, 64, 43320, 43320] + - [163, 6364.0] + - - [64, 64, 36, 50176, 64, 64, 50176, 50176] + - [157, 6222.0] + - - [64, 64, 49, 36864, 64, 64, 36864, 36864] + - [88, 6004.0] + - - [64, 64, 64, 25600, 64, 64, 25600, 25600] + - [137, 5973.0] + - - [256, 256, 1, 60800, 256, 256, 60800, 60800] + - [150, 9833.0] + - - [256, 256, 1, 54400, 256, 256, 54400, 54400] + - [144, 9832.0] + - - [256, 256, 1, 51520, 256, 256, 51520, 51520] + - [141, 10206.0] + - - [256, 256, 1, 55296, 256, 256, 55296, 55296] + - [154, 9820.0] + - - [256, 256, 1, 56832, 256, 256, 56832, 56832] + - [146, 9862.0] + - - [256, 256, 1, 45632, 256, 256, 45632, 45632] + - [141, 10142.0] + - - [256, 256, 1, 49152, 256, 256, 49152, 49152] + - [145, 8932.0] + - - [256, 512, 1, 13600, 256, 256, 13600, 13600] + - [149, 12589.0] + - - [256, 256, 1, 43008, 256, 256, 43008, 43008] + - [146, 9702.0] + - - [256, 512, 1, 15200, 256, 256, 15200, 15200] + - [152, 12753.0] + - - [256, 512, 1, 12880, 256, 256, 12880, 12880] + - [142, 12408.0] + - - [256, 512, 1, 13824, 256, 256, 13824, 13824] + - [162, 11737.0] + - - [512, 256, 1, 13824, 512, 512, 13824, 13824] + - [166, 11582.0] + - - [256, 512, 1, 14208, 256, 256, 14208, 14208] + - [156, 11921.0] + - - [512, 256, 1, 14208, 512, 512, 14208, 14208] + - [166, 11851.0] + - - [512, 256, 1, 15200, 512, 512, 15200, 15200] + - [142, 12771.0] + - - [256, 512, 1, 12288, 256, 256, 12288, 12288] + - [156, 11400.0] + - - [512, 256, 1, 12288, 512, 512, 12288, 12288] + - [147, 11331.0] + - - [1024, 200, 1, 560, 1024, 1024, 560, 560] + - [191, 4522.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [215, 5685.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [215, 4647.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [213, 3342.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [197, 4107.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [215, 3525.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [179, 4659.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [197, 4884.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [197, 4369.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [181, 4758.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [183, 4544.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [203, 3671.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [199, 6472.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [215, 5484.0] + - - [704, 256, 1, 128, 704, 704, 128, 128] + - [213, 3697.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [181, 3782.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [213, 4662.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [213, 4862.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [183, 5383.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [199, 4737.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [199, 5519.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [215, 5124.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [179, 3720.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [199, 4683.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [181, 3424.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [197, 5313.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [211, 3789.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [199, 5676.0] + - - [35, 8457, 1, 1760, 35, 35, 1760, 1760] + - [174, 3573.0] + - - [64, 2944, 1, 128, 64, 64, 128, 128] + - [181, 3954.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [181, 5165.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [213, 4741.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [183, 5567.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [213, 4373.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [199, 3745.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [199, 4512.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [211, 3301.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [181, 4296.0] + - - [35, 8457, 1, 2560, 35, 35, 2560, 2560] + - [215, 3573.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 128] + - [199, 5151.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [174, 4695.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [204, 4030.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [197, 3553.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [188, 3756.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [199, 6389.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [181, 3709.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [183, 5128.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 128] + - [181, 3840.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [199, 5313.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [199, 2973.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [183, 4613.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [199, 5573.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [199, 4885.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 128] + - [199, 4044.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [199, 6130.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [215, 5336.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [183, 6487.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [199, 3859.0] + - - [35, 8457, 1, 2048, 35, 35, 2048, 2048] + - [183, 3568.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [197, 4248.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 128] + - [181, 3159.0] + - - [256, 1024, 1, 128, 256, 256, 128, 128] + - [215, 4877.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [213, 4885.0] + - - [35, 8457, 1, 4096, 35, 35, 4096, 4096] + - [199, 3456.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [213, 4173.0] + - - [448, 256, 1, 128, 448, 448, 128, 128] + - [199, 3021.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [179, 3735.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [197, 4202.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [189, 4910.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [178, 2465.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [179, 3840.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [215, 4582.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [197, 4776.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [204, 4150.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 128] + - [181, 3065.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [183, 6184.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [174, 5331.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [178, 2486.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [197, 4927.0] + - - [256, 448, 1, 128, 256, 256, 128, 128] + - [199, 2948.0] + - - [64, 3584, 1, 128, 64, 64, 128, 128] + - [199, 4369.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [172, 5041.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [215, 4122.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [197, 4948.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [197, 4919.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [178, 2884.0] + - - [64, 1856, 1, 128, 64, 64, 128, 128] + - [181, 3116.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [181, 3626.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [183, 4498.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [197, 4045.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [183, 5787.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [215, 6410.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [188, 3700.0] + - - [64, 1408, 1, 128, 64, 64, 128, 128] + - [169, 2444.0] + - - [256, 704, 1, 128, 256, 256, 128, 128] + - [181, 3697.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 128] + - [178, 2444.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [197, 5314.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [183, 4215.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [215, 5458.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [183, 6208.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [183, 4748.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [199, 3784.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [179, 5040.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [179, 2958.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [199, 5395.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [199, 4751.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [215, 4793.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [199, 6174.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [197, 4881.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [211, 3945.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [183, 4993.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [199, 5825.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [199, 3763.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [197, 4826.0] + - - [64, 2368, 1, 128, 64, 64, 128, 128] + - [181, 3201.0] + - - [64, 4288, 1, 128, 64, 64, 128, 128] + - [199, 5212.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [199, 5094.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [197, 5105.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [179, 2980.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [197, 5069.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [213, 4812.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [183, 5655.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [191, 4641.0] + - - [448, 448, 1, 128, 448, 448, 128, 128] + - [181, 4091.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [213, 4202.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [191, 5243.0] + - - [1024, 256, 1, 1536, 1024, 1024, 1536, 1536] + - [215, 6119.0] + - - [1024, 200, 1, 1408, 1024, 1024, 1408, 1408] + - [191, 4833.0] + - - [1024, 200, 1, 6144, 1024, 1024, 6144, 6144] + - [183, 4885.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [199, 6247.0] + - - [512, 256, 1, 3200, 512, 512, 3200, 3200] + - [174, 5501.0] + - - [1024, 200, 1, 4608, 1024, 1024, 4608, 4608] + - [199, 4909.0] + - - [512, 256, 1, 1792, 512, 512, 1792, 1792] + - [215, 5227.0] + - - [1024, 200, 1, 1792, 1024, 1024, 1792, 1792] + - [199, 4855.0] + - - [512, 200, 1, 2816, 512, 512, 2816, 2816] + - [174, 4229.0] + - - [512, 200, 1, 3072, 512, 512, 3072, 3072] + - [215, 4181.0] + - - [1024, 200, 1, 128, 1024, 1024, 128, 128] + - [191, 3844.0] + - - [1024, 200, 1, 5120, 1024, 1024, 5120, 5120] + - [215, 4902.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [199, 5492.0] + - - [512, 256, 1, 2560, 512, 512, 2560, 2560] + - [215, 5245.0] + - - [1024, 256, 1, 4160, 1024, 1024, 4160, 4160] + - [191, 6304.0] + - - [1024, 200, 1, 512, 1024, 1024, 512, 512] + - [215, 4632.0] + - - [512, 512, 1, 1536, 512, 512, 1536, 1536] + - [199, 6155.0] + - - [1024, 256, 1, 896, 1024, 1024, 896, 896] + - [215, 6076.0] + - - [1024, 200, 1, 3200, 1024, 1024, 3200, 3200] + - [215, 4903.0] + - - [1024, 200, 1, 1536, 1024, 1024, 1536, 1536] + - [215, 4829.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [215, 6049.0] + - - [128, 1024, 1, 512, 128, 128, 512, 512] + - [183, 4800.0] + - - [1024, 256, 1, 5120, 1024, 1024, 5120, 5120] + - [183, 6189.0] + - - [1024, 200, 1, 2304, 1024, 1024, 2304, 2304] + - [199, 4874.0] + - - [1024, 256, 1, 1664, 1024, 1024, 1664, 1664] + - [199, 6198.0] + - - [512, 512, 1, 1024, 512, 512, 1024, 1024] + - [199, 6081.0] + - - [1024, 256, 1, 2080, 1024, 1024, 2080, 2080] + - [191, 6246.0] + - - [512, 200, 1, 768, 512, 512, 768, 768] + - [183, 3952.0] + - - [1024, 256, 1, 2816, 1024, 1024, 2816, 2816] + - [199, 6237.0] + - - [1024, 200, 1, 64, 1024, 1024, 64, 64] + - [191, 3197.0] + - - [512, 512, 1, 2304, 512, 512, 2304, 2304] + - [199, 6178.0] + - - [128, 1024, 1, 2048, 128, 128, 2048, 2048] + - [183, 5222.0] + - - [512, 200, 1, 2560, 512, 512, 2560, 2560] + - [183, 4196.0] + - - [512, 256, 1, 1024, 512, 512, 1024, 1024] + - [183, 5194.0] + - - [1024, 256, 1, 1920, 1024, 1024, 1920, 1920] + - [199, 6205.0] + - - [512, 200, 1, 2304, 512, 512, 2304, 2304] + - [174, 4160.0] + - - [1024, 256, 1, 384, 1024, 1024, 384, 384] + - [199, 5792.0] + - - [1024, 256, 1, 32, 1024, 1024, 32, 32] + - [191, 2933.0] + - - [1024, 200, 1, 2816, 1024, 1024, 2816, 2816] + - [215, 4896.0] + - - [1024, 200, 1, 3072, 1024, 1024, 3072, 3072] + - [199, 4876.0] + - - [512, 256, 1, 1536, 512, 512, 1536, 1536] + - [183, 5210.0] + - - [1024, 256, 1, 512, 1024, 1024, 512, 512] + - [215, 5871.0] + - - [256, 512, 1, 512, 256, 256, 512, 512] + - [199, 4849.0] + - - [1024, 200, 1, 3840, 1024, 1024, 3840, 3840] + - [199, 4862.0] + - - [256, 1024, 1, 512, 256, 256, 512, 512] + - [199, 5928.0] + - - [1024, 256, 1, 1152, 1024, 1024, 1152, 1152] + - [199, 6136.0] + - - [512, 512, 1, 2816, 512, 512, 2816, 2816] + - [199, 6201.0] + - - [512, 200, 1, 1280, 512, 512, 1280, 1280] + - [199, 4122.0] + - - [512, 200, 1, 3200, 512, 512, 3200, 3200] + - [191, 4272.0] + - - [1024, 256, 1, 2304, 1024, 1024, 2304, 2304] + - [199, 6213.0] + - - [1024, 256, 1, 6144, 1024, 1024, 6144, 6144] + - [183, 6225.0] + - - [1024, 200, 1, 2560, 1024, 1024, 2560, 2560] + - [215, 4883.0] + - - [1024, 256, 1, 5632, 1024, 1024, 5632, 5632] + - [215, 6232.0] + - - [512, 256, 1, 768, 512, 512, 768, 768] + - [199, 5084.0] + - - [1024, 256, 1, 3072, 1024, 1024, 3072, 3072] + - [215, 6199.0] + - - [256, 512, 1, 2048, 256, 256, 2048, 2048] + - [183, 5225.0] + - - [1024, 200, 1, 1152, 1024, 1024, 1152, 1152] + - [215, 4813.0] + - - [512, 512, 1, 3072, 512, 512, 3072, 3072] + - [183, 6217.0] + - - [1024, 200, 1, 1664, 1024, 1024, 1664, 1664] + - [215, 4853.0] + - - [1024, 200, 1, 32, 1024, 1024, 32, 32] + - [191, 2156.0] + - - [1024, 200, 1, 384, 1024, 1024, 384, 384] + - [215, 4551.0] + - - [512, 256, 1, 2304, 512, 512, 2304, 2304] + - [215, 5278.0] + - - [256, 512, 1, 1024, 256, 256, 1024, 1024] + - [183, 5162.0] + - - [1024, 200, 1, 3328, 1024, 1024, 3328, 3328] + - [215, 4904.0] + - - [1024, 200, 1, 2080, 1024, 1024, 2080, 2080] + - [191, 4891.0] + - - [512, 200, 1, 1792, 512, 512, 1792, 1792] + - [174, 4129.0] + - - [1024, 256, 1, 1792, 1024, 1024, 1792, 1792] + - [199, 6186.0] + - - [1024, 200, 1, 7168, 1024, 1024, 7168, 7168] + - [183, 4893.0] + - - [512, 256, 1, 3072, 512, 512, 3072, 3072] + - [215, 5246.0] + - - [1024, 200, 1, 2048, 1024, 1024, 2048, 2048] + - [215, 4856.0] + - - [512, 512, 1, 1280, 512, 512, 1280, 1280] + - [199, 6130.0] + - - [1024, 200, 1, 1280, 1024, 1024, 1280, 1280] + - [215, 4805.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [183, 3745.0] + - - [1024, 256, 1, 2560, 1024, 1024, 2560, 2560] + - [199, 6198.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [183, 4747.0] + - - [1024, 256, 1, 3200, 1024, 1024, 3200, 3200] + - [199, 6265.0] + - - [512, 512, 1, 2560, 512, 512, 2560, 2560] + - [199, 6211.0] + - - [1024, 256, 1, 640, 1024, 1024, 640, 640] + - [199, 5983.0] + - - [1024, 256, 1, 3584, 1024, 1024, 3584, 3584] + - [199, 6220.0] + - - [512, 512, 1, 3200, 512, 512, 3200, 3200] + - [191, 6270.0] + - - [1024, 256, 1, 7680, 1024, 1024, 7680, 7680] + - [215, 6126.0] + - - [512, 200, 1, 1536, 512, 512, 1536, 1536] + - [183, 4109.0] + - - [512, 256, 1, 2816, 512, 512, 2816, 2816] + - [215, 5318.0] + - - [1024, 200, 1, 768, 1024, 1024, 768, 768] + - [215, 4740.0] + - - [512, 200, 1, 2048, 512, 512, 2048, 2048] + - [183, 4119.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 128] + - [215, 4835.0] + - - [1024, 200, 1, 4096, 1024, 1024, 4096, 4096] + - [215, 4889.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [199, 6128.0] + - - [1024, 200, 1, 896, 1024, 1024, 896, 896] + - [199, 4771.0] + - - [1024, 256, 1, 4608, 1024, 1024, 4608, 4608] + - [215, 6205.0] + - - [128, 1024, 1, 1024, 128, 128, 1024, 1024] + - [183, 5111.0] + - - [1024, 256, 1, 2048, 1024, 1024, 2048, 2048] + - [199, 6172.0] + - - [512, 256, 1, 1280, 512, 512, 1280, 1280] + - [183, 5223.0] + - - [256, 1024, 1, 2048, 256, 256, 2048, 2048] + - [183, 6175.0] + - - [512, 512, 1, 2048, 512, 512, 2048, 2048] + - [183, 6164.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [199, 4920.0] + - - [1024, 200, 1, 7680, 1024, 1024, 7680, 7680] + - [215, 4859.0] + - - [1024, 200, 1, 6656, 1024, 1024, 6656, 6656] + - [199, 4912.0] + - - [512, 200, 1, 1024, 512, 512, 1024, 1024] + - [183, 4074.0] + - - [1024, 256, 1, 3840, 1024, 1024, 3840, 3840] + - [215, 6170.0] + - - [512, 512, 1, 768, 512, 512, 768, 768] + - [199, 6060.0] + - - [1024, 256, 1, 64, 1024, 1024, 64, 64] + - [191, 4112.0] + - - [1024, 200, 1, 1920, 1024, 1024, 1920, 1920] + - [174, 4855.0] + - - [1024, 256, 1, 7168, 1024, 1024, 7168, 7168] + - [183, 6226.0] + - - [512, 512, 1, 1792, 512, 512, 1792, 1792] + - [199, 6157.0] + - - [1024, 200, 1, 256, 1024, 1024, 256, 256] + - [215, 4362.0] + - - [256, 1024, 1, 1024, 256, 256, 1024, 1024] + - [199, 6068.0] + - - [1024, 200, 1, 640, 1024, 1024, 640, 640] + - [215, 4708.0] + - - [1024, 200, 1, 4160, 1024, 1024, 4160, 4160] + - [191, 4929.0] + - - [1024, 200, 1, 5632, 1024, 1024, 5632, 5632] + - [199, 4911.0] + - - [1024, 256, 1, 6656, 1024, 1024, 6656, 6656] + - [183, 6238.0] + - - [1024, 256, 1, 768, 1024, 1024, 768, 768] + - [215, 6031.0] + - - [512, 256, 1, 2048, 512, 512, 2048, 2048] + - [183, 5247.0] + - - [1024, 200, 1, 3584, 1024, 1024, 3584, 3584] + - [215, 4903.0] + - - [1024, 256, 1, 1408, 1024, 1024, 1408, 1408] + - [191, 6168.0] + - - [1024, 256, 1, 4096, 1024, 1024, 4096, 4096] + - [183, 6182.0] + - - [1024, 128, 1, 289, 1024, 1024, 289, 289] + - [191, 4200.0] + - - [768, 192, 1, 289, 768, 768, 289, 289] + - [191, 4767.0] + - - [32, 32, 1984, 64, 32, 32, 64, 64] + - [176, 3734.0] + - - [54, 54, 1184, 64, 54, 54, 64, 64] + - [183, 4915.0] + - - [35, 35, 1808, 64, 35, 35, 64, 64] + - [215, 2133.0] + - - [45, 45, 1424, 64, 45, 45, 64, 64] + - [191, 3502.0] + - - [49, 49, 1296, 64, 49, 49, 64, 64] + - [183, 4088.0] + - - [59, 59, 1088, 64, 59, 59, 64, 64] + - [174, 5832.0] + - - [41, 41, 1552, 64, 41, 41, 64, 64] + - [191, 2905.0] + - - [38, 38, 1680, 64, 38, 38, 64, 64] + - [183, 2523.0] + - - [2048, 128, 1, 4096, 2048, 2048, 4096, 4096] + - [199, 6204.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [199, 5057.0] + - - [1152, 128, 1, 784, 1152, 1152, 784, 784] + - [199, 5612.0] + - - [864, 96, 1, 1225, 864, 864, 1225, 1225] + - [170, 3435.0] + - - [896, 192, 1, 289, 896, 896, 289, 289] + - [204, 4157.0] + - - [768, 128, 1, 289, 768, 768, 289, 289] + - [170, 3214.0] + - - [1344, 192, 1, 289, 1344, 1344, 289, 289] + - [174, 5541.0] + - - [384, 192, 1, 1225, 384, 384, 1225, 1225] + - [205, 3442.0] + - - [832, 192, 1, 49, 832, 832, 49, 49] + - [204, 2262.0] + - - [1280, 192, 1, 64, 1280, 1280, 64, 64] + - [191, 3836.0] + - - [512, 256, 1, 196, 512, 512, 196, 196] + - [172, 3745.0] + - - [864, 96, 1, 289, 864, 864, 289, 289] + - [170, 2833.0] + - - [896, 128, 1, 289, 896, 896, 289, 289] + - [215, 3741.0] + - - [1200, 64, 1, 1225, 1200, 1200, 1225, 1225] + - [188, 3224.0] + - - [1024, 256, 1, 289, 1024, 1024, 289, 289] + - [191, 5538.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [191, 5286.0] + - - [1120, 192, 1, 289, 1120, 1120, 289, 289] + - [174, 4645.0] + - - [800, 96, 1, 784, 800, 800, 784, 784] + - [188, 3179.0] + - - [864, 128, 1, 784, 864, 864, 784, 784] + - [199, 4205.0] + - - [1344, 224, 1, 289, 1344, 1344, 289, 289] + - [204, 4599.0] + - - [1152, 192, 1, 784, 1152, 1152, 784, 784] + - [174, 5152.0] + - - [800, 128, 1, 196, 800, 800, 196, 196] + - [188, 3060.0] + - - [864, 208, 1, 196, 864, 864, 196, 196] + - [204, 4021.0] + - - [720, 192, 1, 5041, 720, 720, 5041, 5041] + - [174, 5618.0] + - - [576, 192, 1, 3136, 576, 576, 3136, 3136] + - [174, 4713.0] + - - [832, 256, 1, 49, 832, 832, 49, 49] + - [204, 2932.0] + - - [1200, 128, 1, 49, 1200, 1200, 49, 49] + - [174, 2114.0] + - - [528, 256, 1, 196, 528, 528, 196, 196] + - [215, 3873.0] + - - [256, 512, 1, 784, 256, 256, 784, 784] + - [199, 5123.0] + - - [480, 192, 1, 196, 480, 480, 196, 196] + - [179, 2796.0] + - - [96, 64, 36, 2592, 96, 96, 2592, 2592] + - [206, 5270.0] + - - [96, 96, 36, 2592, 96, 96, 2592, 2592] + - [170, 4795.0] + - - [1024, 192, 1, 289, 1024, 1024, 289, 289] + - [204, 4688.0] + - - [528, 160, 1, 196, 528, 528, 196, 196] + - [169, 2571.0] + - - [512, 160, 1, 196, 512, 512, 196, 196] + - [169, 2590.0] + - - [768, 160, 1, 289, 768, 768, 289, 289] + - [172, 3928.0] + - - [64, 32, 36, 43808, 64, 64, 43808, 43808] + - [187, 2982.0] + - - [832, 160, 1, 49, 832, 832, 49, 49] + - [173, 1843.0] + - - [2048, 64, 1, 1001, 2048, 2048, 1001, 1001] + - [191, 4747.0] + - - [2048, 128, 1, 1001, 2048, 2048, 1001, 1001] + - [191, 6013.0] + - - [1536, 64, 1, 1001, 1536, 1536, 1001, 1001] + - [203, 3994.0] + - - [96, 96, 49, 3136, 96, 96, 3136, 3136] + - [203, 4918.0] + - - [64, 32, 49, 57600, 64, 64, 57600, 57600] + - [217, 2882.0] + - - [96, 64, 49, 6272, 96, 96, 6272, 6272] + - [183, 4435.0] + - - [64, 32, 49, 115200, 64, 64, 115200, 115200] + - [201, 2651.0] + - - [96, 96, 64, 2304, 96, 96, 2304, 2304] + - [197, 4573.0] + - - [96, 96, 49, 6272, 96, 96, 6272, 6272] + - [189, 4164.0] + - - [96, 64, 36, 5184, 96, 96, 5184, 5184] + - [199, 5288.0] + - - [64, 32, 64, 40000, 64, 64, 40000, 40000] + - [189, 4074.0] + - - [96, 64, 64, 4608, 96, 96, 4608, 4608] + - [215, 4319.0] + - - [96, 96, 36, 5184, 96, 96, 5184, 5184] + - [188, 4810.0] + - - [96, 64, 64, 2304, 96, 96, 2304, 2304] + - [215, 4703.0] + - - [96, 64, 49, 3136, 96, 96, 3136, 3136] + - [191, 4824.0] + - - [64, 32, 36, 87616, 64, 64, 87616, 87616] + - [214, 2947.0] + - - [64, 32, 64, 80000, 64, 64, 80000, 80000] + - [209, 3383.0] + - - [96, 96, 64, 4608, 96, 96, 4608, 4608] + - [197, 4016.0] + - - [64, 32, 36, 175232, 64, 64, 175232, 175232] + - [198, 2740.0] + - - [128, 128, 11, 3264, 128, 128, 3264, 3264] + - [188, 4923.0] + - - [192, 128, 11, 6528, 192, 192, 6528, 6528] + - [191, 6456.0] + - - [128, 128, 11, 6528, 128, 128, 6528, 6528] + - [189, 4939.0] + - - [160, 160, 9, 4896, 160, 160, 4896, 4896] + - [172, 4725.0] + - - [192, 160, 11, 6528, 192, 192, 6528, 6528] + - [189, 5533.0] + - - [192, 128, 9, 4896, 192, 192, 4896, 4896] + - [191, 5337.0] + - - [128, 128, 9, 4896, 128, 128, 4896, 4896] + - [191, 6280.0] + - - [192, 128, 11, 3264, 192, 192, 3264, 3264] + - [174, 6475.0] + - - [160, 160, 11, 3264, 160, 160, 3264, 3264] + - [203, 5082.0] + - - [192, 160, 9, 4896, 192, 192, 4896, 4896] + - [204, 5663.0] + - - [192, 160, 11, 3264, 192, 192, 3264, 3264] + - [172, 5530.0] + - - [160, 160, 11, 6528, 160, 160, 6528, 6528] + - [195, 5071.0] + - - [4096, 64, 1, 1024, 4096, 4096, 1024, 1024] + - [215, 6019.0] + - - [49, 49, 160, 64, 49, 49, 64, 64] + - [191, 2941.0] + - - [54, 54, 592, 64, 54, 54, 64, 64] + - [183, 4596.0] + - - [59, 59, 512, 64, 59, 59, 64, 64] + - [206, 5360.0] + - - [104, 104, 16, 64, 104, 104, 64, 64] + - [174, 2564.0] + - - [32, 32, 624, 64, 32, 32, 64, 64] + - [173, 3117.0] + - - [32, 32, 992, 64, 32, 32, 64, 64] + - [176, 3365.0] + - - [35, 35, 384, 64, 35, 35, 64, 64] + - [183, 1886.0] + - - [35, 35, 904, 64, 35, 35, 64, 64] + - [183, 2075.0] + - - [38, 38, 320, 64, 38, 38, 64, 64] + - [199, 2194.0] + - - [38, 38, 840, 64, 38, 38, 64, 64] + - [191, 2444.0] + - - [41, 41, 312, 64, 41, 41, 64, 64] + - [174, 2494.0] + - - [41, 41, 776, 64, 41, 41, 64, 64] + - [174, 2768.0] + - - [45, 45, 392, 64, 45, 45, 64, 64] + - [183, 3117.0] + - - [45, 45, 712, 64, 45, 45, 64, 64] + - [199, 3326.0] + - - [49, 49, 648, 64, 49, 49, 64, 64] + - [215, 3899.0] + - - [54, 54, 200, 64, 54, 54, 64, 64] + - [215, 3778.0] + - - [59, 59, 544, 64, 59, 59, 64, 64] + - [206, 5469.0] + - - [91, 91, 40, 64, 91, 91, 64, 64] + - [203, 3232.0] + - - [91, 93, 40, 64, 91, 91, 64, 64] + - [203, 3333.0] + - - [93, 93, 40, 64, 93, 93, 64, 64] + - [203, 3375.0] + - - [102, 102, 56, 64, 102, 102, 64, 64] + - [174, 3491.0] + - - [103, 103, 16, 64, 103, 103, 64, 64] + - [174, 2480.0] + - - [103, 104, 16, 64, 103, 103, 64, 64] + - [191, 2516.0] + - - [112, 112, 16, 64, 112, 112, 64, 64] + - [191, 3015.0] + - - [112, 123, 16, 64, 112, 112, 64, 64] + - [191, 3327.0] + - - [119, 119, 32, 64, 119, 119, 64, 64] + - [174, 4191.0] + - - [119, 135, 32, 64, 119, 119, 64, 64] + - [189, 3926.0] + - - [123, 123, 16, 64, 123, 123, 64, 64] + - [174, 3383.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [199, 5955.0] + - - [513, 512, 1, 512, 513, 513, 512, 512] + - [183, 5727.0] + - - [512, 512, 1, 513, 512, 512, 513, 513] + - [199, 5888.0] + - - [512, 512, 1, 511, 512, 512, 511, 511] + - [191, 5943.0] + - - [512, 513, 1, 512, 512, 512, 512, 512] + - [215, 5792.0] + - - [512, 511, 1, 512, 512, 512, 512, 512] + - [199, 5834.0] + - - [511, 512, 1, 512, 511, 511, 512, 512] + - [183, 5814.0] + - - [479, 512, 1, 512, 479, 479, 512, 512] + - [199, 5493.0] + - - [480, 511, 1, 512, 480, 480, 512, 512] + - [215, 5494.0] + - - [480, 512, 1, 511, 480, 480, 511, 511] + - [174, 5577.0] + - - [480, 512, 1, 513, 480, 480, 513, 513] + - [174, 5554.0] + - - [480, 513, 1, 512, 480, 480, 512, 512] + - [183, 5520.0] + - - [481, 512, 1, 512, 481, 481, 512, 512] + - [183, 5516.0] + - - [511, 480, 1, 512, 511, 511, 512, 512] + - [183, 5432.0] + - - [512, 479, 1, 512, 512, 512, 512, 512] + - [199, 5498.0] + - - [512, 480, 1, 511, 512, 512, 511, 511] + - [191, 5542.0] + - - [512, 480, 1, 513, 512, 512, 513, 513] + - [191, 5549.0] + - - [512, 481, 1, 512, 512, 512, 512, 512] + - [199, 5530.0] + - - [513, 480, 1, 512, 513, 513, 512, 512] + - [199, 5430.0] + - - [480, 512, 1, 512, 480, 480, 512, 512] + - [183, 5524.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [199, 5480.0] + - - [512, 512, 1, 64, 512, 512, 64, 64] + - [191, 4092.0] + - - [2048, 114, 1, 512, 2048, 2048, 512, 512] + - [199, 5220.0] + - - [2048, 114, 1, 768, 2048, 2048, 768, 768] + - [199, 5362.0] + - - [256, 684, 1, 1024, 256, 256, 1024, 1024] + - [213, 4633.0] + - - [33, 33, 1600, 32, 33, 33, 32, 32] + - [191, 1833.0] + - - [383, 384, 1, 384, 383, 383, 384, 384] + - [215, 5070.0] + - - [385, 384, 1, 384, 385, 385, 384, 384] + - [181, 3648.0] + - - [384, 383, 1, 384, 384, 384, 384, 384] + - [215, 5088.0] + - - [384, 385, 1, 384, 384, 384, 384, 384] + - [181, 3648.0] + - - [384, 384, 1, 383, 384, 384, 383, 383] + - [215, 5172.0] + - - [384, 384, 1, 385, 384, 384, 385, 385] + - [215, 5180.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [199, 5262.0] + - - [128, 64, 25, 6498, 128, 128, 6498, 6498] + - [204, 5592.0] + - - [128, 64, 25, 6859, 128, 128, 6859, 6859] + - [189, 5501.0] + - - [64, 64, 64, 3042, 64, 64, 3042, 3042] + - [191, 6223.0] + - - [64, 64, 64, 3211, 64, 64, 3211, 3211] + - [183, 6170.0] + - - [64, 64, 49, 4050, 64, 64, 4050, 4050] + - [189, 5439.0] + - - [64, 64, 49, 4275, 64, 64, 4275, 4275] + - [204, 5448.0] + - - [64, 64, 36, 6498, 64, 64, 6498, 6498] + - [215, 6148.0] + - - [64, 64, 36, 6859, 64, 64, 6859, 6859] + - [215, 5758.0] + - - [1152, 128, 1, 1444, 1152, 1152, 1444, 1444] + - [174, 5627.0] + - - [512, 256, 1, 361, 512, 512, 361, 361] + - [172, 4397.0] + - - [576, 128, 1, 1444, 576, 576, 1444, 1444] + - [190, 3454.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [197, 4976.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 4340.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [213, 4761.0] + - - [32, 32, 4608, 64, 32, 32, 64, 64] + - [208, 3909.0] + - - [32, 35, 4608, 64, 32, 32, 64, 64] + - [203, 3169.0] + - - [34, 34, 4736, 64, 34, 34, 64, 64] + - [199, 2059.0] + - - [35, 35, 4608, 64, 35, 35, 64, 64] + - [174, 2186.0] + - - [128, 864, 1, 256, 128, 128, 256, 256] + - [197, 3513.0] + - - [256, 864, 1, 512, 256, 256, 512, 512] + - [199, 4898.0] + - - [512, 256, 1, 784, 512, 512, 784, 784] + - [199, 5133.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 4023.0] + - - [1024, 256, 1, 3800, 1024, 1024, 3800, 3800] + - [199, 6204.0] + - - [1024, 256, 1, 3400, 1024, 1024, 3400, 3400] + - [199, 6196.0] + - - [256, 1024, 1, 3400, 256, 256, 3400, 3400] + - [183, 6242.0] + - - [1024, 256, 1, 3220, 1024, 1024, 3220, 3220] + - [215, 6207.0] + - - [256, 1024, 1, 3220, 256, 256, 3220, 3220] + - [191, 6242.0] + - - [1024, 256, 1, 3456, 1024, 1024, 3456, 3456] + - [199, 6261.0] + - - [256, 1024, 1, 3456, 256, 256, 3456, 3456] + - [199, 6255.0] + - - [256, 1024, 1, 3072, 256, 256, 3072, 3072] + - [199, 6159.0] + - - [1024, 256, 1, 3552, 1024, 1024, 3552, 3552] + - [191, 6292.0] + - - [256, 1024, 1, 3552, 256, 256, 3552, 3552] + - [191, 6298.0] + - - [256, 1024, 1, 2852, 256, 256, 2852, 2852] + - [191, 6233.0] + - - [1024, 256, 1, 2852, 1024, 1024, 2852, 2852] + - [215, 6198.0] + - - [256, 512, 1, 10752, 256, 256, 10752, 10752] + - [213, 5347.0] + - - [256, 1024, 1, 3800, 256, 256, 3800, 3800] + - [183, 6252.0] + - - [256, 512, 1, 10560, 256, 256, 10560, 10560] + - [191, 5660.0] + - - [256, 1024, 1, 2992, 256, 256, 2992, 2992] + - [191, 6271.0] + - - [256, 1024, 1, 2688, 256, 256, 2688, 2688] + - [206, 6247.0] + - - [1024, 256, 1, 2688, 1024, 1024, 2688, 2688] + - [215, 6245.0] + - - [256, 1024, 1, 2904, 256, 256, 2904, 2904] + - [191, 6225.0] + - - [1024, 256, 1, 2904, 1024, 1024, 2904, 2904] + - [215, 6185.0] + - - [256, 1024, 1, 2640, 256, 256, 2640, 2640] + - [191, 6257.0] + - - [1024, 256, 1, 2640, 1024, 1024, 2640, 2640] + - [191, 6239.0] + - - [1024, 256, 1, 4032, 1024, 1024, 4032, 4032] + - [191, 6300.0] + - - [1024, 256, 1, 2992, 1024, 1024, 2992, 2992] + - [199, 6246.0] + - - [256, 1024, 1, 3360, 256, 256, 3360, 3360] + - [191, 6296.0] + - - [1024, 256, 1, 3360, 1024, 1024, 3360, 3360] + - [191, 6286.0] + - - [1024, 256, 1, 3500, 1024, 1024, 3500, 3500] + - [199, 6188.0] + - - [256, 1024, 1, 3500, 256, 256, 3500, 3500] + - [199, 6224.0] + - - [1024, 256, 1, 3168, 1024, 1024, 3168, 3168] + - [191, 6280.0] + - - [256, 1024, 1, 3168, 256, 256, 3168, 3168] + - [191, 6295.0] + - - [256, 1024, 1, 3036, 256, 256, 3036, 3036] + - [191, 6258.0] + - - [1024, 256, 1, 4200, 1024, 1024, 4200, 4200] + - [199, 6209.0] + - - [1024, 256, 1, 3600, 1024, 1024, 3600, 3600] + - [199, 6258.0] + - - [256, 1024, 1, 3600, 256, 256, 3600, 3600] + - [199, 6275.0] + - - [256, 1024, 1, 2944, 256, 256, 2944, 2944] + - [191, 6255.0] + - - [1024, 256, 1, 2944, 1024, 1024, 2944, 2944] + - [215, 6250.0] + - - [1024, 256, 1, 3700, 1024, 1024, 3700, 3700] + - [191, 6218.0] + - - [256, 1024, 1, 2352, 256, 256, 2352, 2352] + - [191, 6261.0] + - - [1024, 256, 1, 2352, 1024, 1024, 2352, 2352] + - [191, 6229.0] + - - [256, 1024, 1, 3700, 256, 256, 3700, 3700] + - [183, 6244.0] + - - [256, 1024, 1, 2816, 256, 256, 2816, 2816] + - [199, 6174.0] + - - [256, 512, 1, 11408, 256, 256, 11408, 11408] + - [191, 5494.0] + - - [1024, 256, 1, 3036, 1024, 1024, 3036, 3036] + - [191, 6226.0] + - - [1024, 256, 1, 3264, 1024, 1024, 3264, 3264] + - [191, 6282.0] + - - [256, 1024, 1, 3264, 256, 256, 3264, 3264] + - [191, 6284.0] + - - [1024, 256, 1, 3864, 1024, 1024, 3864, 3864] + - [199, 6204.0] + - - [256, 1024, 1, 4032, 256, 256, 4032, 4032] + - [215, 6301.0] + - - [1024, 256, 1, 3128, 1024, 1024, 3128, 3128] + - [199, 6193.0] + - - [256, 1024, 1, 3128, 256, 256, 3128, 3128] + - [183, 6239.0] + - - [256, 1024, 1, 3200, 256, 256, 3200, 3200] + - [191, 6262.0] + - - [256, 512, 1, 11616, 256, 256, 11616, 11616] + - [206, 5656.0] + - - [1024, 256, 1, 4000, 1024, 1024, 4000, 4000] + - [199, 6297.0] + - - [256, 1024, 1, 2520, 256, 256, 2520, 2520] + - [191, 6233.0] + - - [1024, 256, 1, 2520, 1024, 1024, 2520, 2520] + - [199, 6176.0] + - - [256, 1024, 1, 2976, 256, 256, 2976, 2976] + - [191, 6286.0] + - - [256, 1024, 1, 2400, 256, 256, 2400, 2400] + - [191, 6279.0] + - - [1024, 256, 1, 2400, 1024, 1024, 2400, 2400] + - [191, 6259.0] + - - [1024, 256, 1, 3696, 1024, 1024, 3696, 3696] + - [191, 6259.0] + - - [1024, 256, 1, 3900, 1024, 1024, 3900, 3900] + - [191, 6251.0] + - - [1024, 256, 1, 3772, 1024, 1024, 3772, 3772] + - [191, 6241.0] + - - [256, 1024, 1, 3696, 256, 256, 3696, 3696] + - [183, 6278.0] + - - [256, 1024, 1, 2728, 256, 256, 2728, 2728] + - [191, 6234.0] + - - [1024, 256, 1, 2728, 1024, 1024, 2728, 2728] + - [191, 6179.0] + - - [1024, 256, 1, 2480, 1024, 1024, 2480, 2480] + - [191, 6232.0] + - - [256, 1024, 1, 2480, 256, 256, 2480, 2480] + - [191, 6267.0] + - - [1024, 256, 1, 2880, 1024, 1024, 2880, 2880] + - [191, 6274.0] + - - [512, 256, 1, 3220, 512, 512, 3220, 3220] + - [172, 5236.0] + - - [256, 1024, 1, 2880, 256, 256, 2880, 2880] + - [191, 6290.0] + - - [256, 1024, 1, 4200, 256, 256, 4200, 4200] + - [183, 6254.0] + - - [1024, 256, 1, 3648, 1024, 1024, 3648, 3648] + - [191, 6289.0] + - - [1024, 256, 1, 3312, 1024, 1024, 3312, 3312] + - [191, 6251.0] + - - [256, 1024, 1, 3648, 256, 256, 3648, 3648] + - [191, 6296.0] + - - [1024, 256, 1, 3300, 1024, 1024, 3300, 3300] + - [215, 6213.0] + - - [1024, 256, 1, 3528, 1024, 1024, 3528, 3528] + - [199, 6202.0] + - - [256, 1024, 1, 2604, 256, 256, 2604, 2604] + - [191, 6218.0] + - - [1024, 256, 1, 2604, 1024, 1024, 2604, 2604] + - [191, 6168.0] + - - [512, 256, 1, 11408, 512, 512, 11408, 11408] + - [206, 5492.0] + - - [256, 1024, 1, 3312, 256, 256, 3312, 3312] + - [191, 6273.0] + - - [256, 1024, 1, 3300, 256, 256, 3300, 3300] + - [183, 6224.0] + - - [256, 1024, 1, 3528, 256, 256, 3528, 3528] + - [183, 6243.0] + - - [1024, 256, 1, 2976, 1024, 1024, 2976, 2976] + - [191, 6278.0] + - - [1024, 256, 1, 2760, 1024, 1024, 2760, 2760] + - [191, 6182.0] + - - [512, 256, 1, 3800, 512, 512, 3800, 3800] + - [172, 5275.0] + - - [256, 1024, 1, 2760, 256, 256, 2760, 2760] + - [191, 6247.0] + - - [1024, 256, 1, 2160, 1024, 1024, 2160, 2160] + - [191, 6220.0] + - - [256, 1024, 1, 2160, 256, 256, 2160, 2160] + - [191, 6259.0] + - - [512, 256, 1, 11616, 512, 512, 11616, 11616] + - [206, 5660.0] + - - [512, 256, 1, 2852, 512, 512, 2852, 2852] + - [172, 5219.0] + - - [256, 1024, 1, 3864, 256, 256, 3864, 3864] + - [199, 6240.0] + - - [512, 256, 1, 2640, 512, 512, 2640, 2640] + - [191, 5355.0] + - - [256, 1024, 1, 4000, 256, 256, 4000, 4000] + - [199, 6304.0] + - - [512, 256, 1, 2904, 512, 512, 2904, 2904] + - [172, 5247.0] + - - [256, 1024, 1, 3900, 256, 256, 3900, 3900] + - [183, 6264.0] + - - [512, 256, 1, 2688, 512, 512, 2688, 2688] + - [191, 5389.0] + - - [256, 1024, 1, 3772, 256, 256, 3772, 3772] + - [183, 6262.0] + - - [512, 256, 1, 3400, 512, 512, 3400, 3400] + - [172, 5266.0] + - - [512, 256, 1, 3456, 512, 512, 3456, 3456] + - [191, 5403.0] + - - [512, 256, 1, 3552, 512, 512, 3552, 3552] + - [191, 5565.0] + - - [29000, 35, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 3722.0] + - - [29000, 36, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 3840.0] + - - [29000, 39, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 4152.0] + - - [29000, 40, 1, 2560, 29000, 29000, 2560, 2560] + - [183, 4247.0] + - - [29000, 42, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 4458.0] + - - [29000, 43, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 4570.0] + - - [29000, 44, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 4672.0] + - - [29000, 46, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 4883.0] + - - [29000, 48, 1, 2560, 29000, 29000, 2560, 2560] + - [183, 5079.0] + - - [29000, 49, 1, 2560, 29000, 29000, 2560, 2560] + - [183, 5204.0] + - - [29000, 50, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 5287.0] + - - [29000, 51, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 5399.0] + - - [29000, 53, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 5613.0] + - - [29000, 54, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 5701.0] + - - [29000, 55, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 5802.0] + - - [29000, 56, 1, 2560, 29000, 29000, 2560, 2560] + - [215, 5908.0] + - - [29000, 57, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 6013.0] + - - [29000, 58, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 6120.0] + - - [29000, 59, 1, 2560, 29000, 29000, 2560, 2560] + - [183, 6226.0] + - - [29000, 61, 1, 2560, 29000, 29000, 2560, 2560] + - [183, 6414.0] + - - [29000, 63, 1, 2560, 29000, 29000, 2560, 2560] + - [199, 6636.0] + - - [288, 64, 1, 21609, 288, 288, 21609, 21609] + - [232, 3283.0] + - - [32, 32, 36, 43808, 32, 32, 43808, 43808] + - [223, 3074.0] + - - [32, 32, 64, 40000, 32, 32, 40000, 40000] + - [234, 2931.0] + - - [32, 32, 49, 115200, 32, 32, 115200, 115200] + - [239, 3051.0] + - - [32, 32, 36, 175232, 32, 32, 175232, 175232] + - [231, 3169.0] + - - [32, 32, 49, 57600, 32, 32, 57600, 57600] + - [239, 3010.0] + - - [32, 32, 36, 87616, 32, 32, 87616, 87616] + - [223, 3183.0] + - - [32, 32, 64, 80000, 32, 32, 80000, 80000] + - [228, 2918.0] + - - [256, 128, 1, 13600, 256, 256, 13600, 13600] + - [219, 4715.0] + - - [256, 128, 1, 12880, 256, 256, 12880, 12880] + - [219, 4709.0] + - - [128, 512, 1, 15200, 128, 128, 15200, 15200] + - [237, 5849.0] + - - [512, 128, 1, 15200, 512, 512, 15200, 15200] + - [237, 5828.0] + - - [128, 512, 1, 11408, 128, 128, 11408, 11408] + - [224, 5746.0] + - - [256, 128, 1, 13824, 256, 256, 13824, 13824] + - [219, 4591.0] + - - [128, 512, 1, 11616, 128, 128, 11616, 11616] + - [224, 5746.0] + - - [256, 128, 1, 14208, 256, 256, 14208, 14208] + - [233, 4680.0] + - - [128, 512, 1, 14208, 128, 128, 14208, 14208] + - [220, 5796.0] + - - [256, 128, 1, 15200, 256, 256, 15200, 15200] + - [219, 4775.0] + - - [512, 128, 1, 11408, 512, 512, 11408, 11408] + - [230, 5730.0] + - - [512, 128, 1, 16800, 512, 512, 16800, 16800] + - [230, 5860.0] + - - [128, 512, 1, 11264, 128, 128, 11264, 11264] + - [230, 5607.0] + - - [512, 128, 1, 11616, 512, 512, 11616, 11616] + - [235, 5720.0] + - - [512, 128, 1, 16128, 512, 512, 16128, 16128] + - [237, 5815.0] + - - [512, 128, 1, 11968, 512, 512, 11968, 11968] + - [237, 5732.0] + - - [128, 512, 1, 11968, 128, 128, 11968, 11968] + - [224, 5766.0] + - - [512, 128, 1, 12288, 512, 512, 12288, 12288] + - [237, 5499.0] + - - [128, 512, 1, 12288, 128, 128, 12288, 12288] + - [230, 5578.0] + - - [128, 512, 1, 12672, 128, 128, 12672, 12672] + - [220, 5755.0] + - - [512, 128, 1, 11776, 512, 512, 11776, 11776] + - [237, 5678.0] + - - [512, 128, 1, 12144, 512, 512, 12144, 12144] + - [226, 5750.0] + - - [512, 128, 1, 11264, 512, 512, 11264, 11264] + - [230, 5581.0] + - - [128, 512, 1, 12144, 128, 128, 12144, 12144] + - [224, 5769.0] + - - [512, 128, 1, 12672, 512, 512, 12672, 12672] + - [230, 5710.0] + - - [128, 512, 1, 12512, 128, 128, 12512, 12512] + - [230, 5779.0] + - - [128, 512, 1, 11776, 128, 128, 11776, 11776] + - [224, 5682.0] + - - [256, 128, 1, 12288, 256, 256, 12288, 12288] + - [240, 4057.0] + - - [40, 40, 1, 1909283, 40, 40, 1909283, 1909283] + - [222, 443.0] + - - [40, 40, 1, 3818566, 40, 40, 3818566, 3818566] + - [222, 443.0] + - - [30522, 20, 1, 1024, 30522, 30522, 1024, 1024] + - [196, 2111.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [241, 2881.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 1280] + - [242, 808.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 256] + - [245, 544.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 3328] + - [245, 942.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [196, 1895.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 1280] + - [246, 581.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 1280] + - [245, 981.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 1280] + - [245, 510.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 128] + - [245, 423.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 128] + - [245, 488.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [243, 2838.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 256] + - [245, 451.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 128] + - [245, 688.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 256] + - [245, 711.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 3328] + - [245, 882.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [180, 2002.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 256] + - [246, 286.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 3328] + - [242, 975.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 3328] + - [242, 623.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 1280] + - [245, 939.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [196, 2453.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 3328] + - [245, 413.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [243, 2871.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [212, 2719.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 256] + - [245, 371.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 128] + - [245, 712.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 128] + - [246, 563.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 3328] + - [242, 1060.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 128] + - [245, 619.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 1280] + - [245, 1000.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 3328] + - [242, 763.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 128] + - [246, 348.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 128] + - [245, 275.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [180, 2212.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [247, 3031.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 1280] + - [245, 388.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 256] + - [245, 712.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 3328] + - [242, 545.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 256] + - [245, 622.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 3328] + - [245, 1014.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [247, 3137.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 128] + - [242, 210.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 256] + - [246, 725.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 1280] + - [245, 901.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [196, 2759.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 1280] + - [246, 705.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 256] + - [245, 779.0] + - - [2048, 32, 1, 1001, 2048, 2048, 1001, 1001] + - [243, 2862.0] + - - [1536, 32, 1, 1001, 1536, 1536, 1001, 1001] + - [180, 2428.0] + - - [1600, 1, 1, 1024, 1600, 1600, 1024, 1024] + - [246, 110.0] + - - [32768, 1, 1, 256, 32768, 32768, 256, 256] + - [244, 250.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [242, 286.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [246, 642.0] + - - [3456, 1, 1, 256, 3456, 3456, 256, 256] + - [245, 153.0] + - - [4096, 1, 1, 256, 4096, 4096, 256, 256] + - [245, 172.0] + - - [6912, 1, 1, 256, 6912, 6912, 256, 256] + - [246, 188.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [246, 1108.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [242, 322.0] + - - [29000, 27, 1, 2560, 29000, 29000, 2560, 2560] + - [212, 2676.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 3328] + - [194, 532.0] + - - [4, 1408, 1, 128, 4, 4, 128, 128] + - [178, 220.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 1280] + - [178, 577.0] + - - [4, 3584, 1, 128, 4, 4, 128, 128] + - [249, 475.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 3328] + - [249, 949.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 3328] + - [184, 422.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 3328] + - [250, 911.0] + - - [4, 4288, 1, 128, 4, 4, 128, 128] + - [249, 552.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 1280] + - [250, 904.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 3328] + - [178, 721.0] + - - [4, 5056, 1, 256, 4, 4, 256, 256] + - [249, 705.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 1280] + - [249, 838.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 3328] + - [210, 612.0] + - - [4, 1856, 1, 256, 4, 4, 256, 256] + - [178, 385.0] + - - [4, 2368, 1, 256, 4, 4, 256, 256] + - [194, 449.0] + - - [4, 2944, 1, 256, 4, 4, 256, 256] + - [250, 533.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 1280] + - [249, 858.0] + - - [4, 6784, 1, 128, 4, 4, 128, 128] + - [249, 668.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 1280] + - [249, 760.0] + - - [4, 5888, 1, 256, 4, 4, 256, 256] + - [248, 767.0] + - - [4, 6784, 1, 256, 4, 4, 256, 256] + - [250, 722.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1280] + - [184, 402.0] + - - [4, 3584, 1, 256, 4, 4, 256, 256] + - [249, 604.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 1280] + - [178, 685.0] + - - [4, 1408, 1, 256, 4, 4, 256, 256] + - [178, 303.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 3328] + - [248, 914.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 1280] + - [248, 904.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1280] + - [178, 508.0] + - - [4, 1856, 1, 128, 4, 4, 128, 128] + - [178, 285.0] + - - [4, 2944, 1, 128, 4, 4, 128, 128] + - [249, 416.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 3328] + - [250, 869.0] + - - [4, 5056, 1, 128, 4, 4, 128, 128] + - [250, 599.0] + - - [4, 4288, 1, 256, 4, 4, 256, 256] + - [250, 673.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3328] + - [250, 813.0] + - - [4, 5888, 1, 128, 4, 4, 128, 128] + - [249, 644.0] + - - [4, 2368, 1, 128, 4, 4, 128, 128] + - [178, 352.0] + - - [32, 1600, 1, 512, 32, 32, 512, 512] + - [178, 2686.0] + - - [2, 2048, 1, 1024, 2, 2, 1024, 1024] + - [178, 274.0] + - - [1, 4096, 1, 256, 1, 1, 256, 256] + - [249, 166.0] + - - [1, 6912, 1, 256, 1, 1, 256, 256] + - [210, 185.0] + - - [2, 2048, 1, 768, 2, 2, 768, 768] + - [178, 266.0] + - - [2, 4608, 1, 768, 2, 2, 768, 768] + - [250, 446.0] + - - [2, 4608, 1, 1024, 2, 2, 1024, 1024] + - [248, 444.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [238, 2376.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [236, 1189.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [225, 1885.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [225, 940.0] + - - [64, 80, 1, 5329, 64, 64, 5329, 5329] + - [236, 1112.0] + - - [576, 96, 1, 5329, 576, 576, 5329, 5329] + - [219, 3957.0] + - - [288, 32, 1, 21609, 288, 288, 21609, 21609] + - [221, 2314.0] + - - [576, 96, 1, 5041, 576, 576, 5041, 5041] + - [219, 4123.0] + - - [27, 32, 1, 22201, 27, 27, 22201, 22201] + - [222, 296.0] + - - [160, 64, 1, 5329, 160, 160, 5329, 5329] + - [218, 1692.0] + - - [448, 64, 1, 5329, 448, 448, 5329, 5329] + - [233, 3552.0] + - - [147, 64, 1, 12544, 147, 147, 12544, 12544] + - [236, 1835.0] + - - [147, 64, 1, 22500, 147, 147, 22500, 22500] + - [218, 2042.0] + - - [576, 64, 1, 5625, 576, 576, 5625, 5625] + - [219, 4508.0] + - - [256, 128, 1, 10752, 256, 256, 10752, 10752] + - [229, 4285.0] + - - [256, 128, 1, 10560, 256, 256, 10560, 10560] + - [219, 4581.0] + - - [256, 128, 1, 11408, 256, 256, 11408, 11408] + - [219, 4645.0] + - - [256, 12, 1, 11408, 256, 256, 11408, 11408] + - [227, 867.0] + - - [256, 128, 1, 11616, 256, 256, 11616, 11616] + - [219, 4642.0] + - - [256, 12, 1, 11616, 256, 256, 11616, 11616] + - [227, 873.0] + - - [256, 12, 1, 12288, 256, 256, 12288, 12288] + - [238, 873.0] + - - [11, 11, 1, 1909283, 11, 11, 1909283, 1909283] + - [222, 46.0] + - - [11, 11, 1, 3818566, 11, 11, 3818566, 3818566] + - [222, 46.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [178, 1673.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [178, 2860.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 3347.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 1404.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [210, 904.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [194, 306.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [194, 460.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 1280] + - [194, 220.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [212, 474.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [197, 2901.0] + - - [64, 4, 1, 256, 64, 64, 256, 256] + - [169, 13.0] + - - [64, 704, 1, 128, 64, 64, 128, 128] + - [169, 1748.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [178, 2078.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 1280] + - [184, 43.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [204, 3142.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [178, 2866.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 128] + - [182, 2097.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [197, 3077.0] + - - [4, 704, 1, 256, 4, 4, 256, 256] + - [210, 152.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 1280] + - [210, 219.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [178, 1942.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [197, 3367.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 1280] + - [169, 20.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [186, 366.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [202, 2524.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [178, 2114.0] + - - [448, 4, 1, 256, 448, 448, 256, 256] + - [184, 97.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [169, 20.0] + - - [256, 4, 1, 128, 256, 256, 128, 128] + - [169, 39.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [178, 2931.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [178, 1332.0] + - - [704, 64, 1, 128, 704, 704, 128, 128] + - [178, 1727.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 256] + - [194, 220.0] + - - [256, 256, 1, 128, 256, 256, 128, 128] + - [178, 2118.0] + - - [64, 256, 1, 128, 64, 64, 128, 128] + - [178, 699.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [178, 2881.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [178, 2338.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [178, 2375.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [187, 2215.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [187, 1303.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [193, 731.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [169, 1379.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [178, 1762.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [178, 2234.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 3328] + - [210, 339.0] + - - [4, 4, 1, 256, 4, 4, 256, 256] + - [169, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [178, 916.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [178, 2416.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [184, 673.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 3328] + - [186, 159.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [181, 2542.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 1280] + - [184, 85.0] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [178, 2176.0] + - - [4, 704, 1, 128, 4, 4, 128, 128] + - [178, 109.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [210, 2315.0] + - - [448, 64, 1, 128, 448, 448, 128, 128] + - [178, 1184.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1280] + - [194, 319.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 1280] + - [184, 143.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 1280] + - [184, 150.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [198, 2558.0] + - - [256, 64, 1, 128, 256, 256, 128, 128] + - [212, 685.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 3328] + - [194, 340.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [212, 352.0] + - - [704, 4, 1, 128, 704, 704, 128, 128] + - [178, 109.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [178, 55.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 3328] + - [184, 93.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [178, 55.0] + - - [4, 4, 1, 128, 4, 4, 128, 128] + - [169, 1.0] + - - [4, 128, 1, 256, 4, 4, 256, 256] + - [178, 29.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [200, 334.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [213, 2962.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [178, 1589.0] + - - [4, 448, 1, 128, 4, 4, 128, 128] + - [169, 70.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [178, 1291.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [184, 666.0] + - - [64, 4, 1, 128, 64, 64, 128, 128] + - [169, 10.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [178, 247.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 3328] + - [194, 235.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 1280] + - [169, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [171, 708.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 128] + - [178, 158.0] + - - [4, 64, 1, 128, 4, 4, 128, 128] + - [169, 10.0] + - - [64, 1024, 1, 128, 64, 64, 128, 128] + - [190, 2118.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [178, 1304.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [178, 1816.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [178, 488.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 1280] + - [178, 316.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [178, 2201.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [186, 730.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [178, 1575.0] + - - [4, 256, 1, 128, 4, 4, 128, 128] + - [178, 40.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [182, 2527.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 3328] + - [169, 1.0] + - - [704, 4, 1, 256, 704, 704, 256, 256] + - [169, 151.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 3328] + - [184, 47.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [213, 3351.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 3328] + - [184, 163.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 3328] + - [177, 45.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 3328] + - [202, 234.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [197, 2901.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [189, 3261.0] + - - [4, 1024, 1, 128, 4, 4, 128, 128] + - [178, 158.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [169, 1375.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [178, 1327.0] + - - [128, 4, 1, 256, 128, 128, 256, 256] + - [169, 27.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [197, 3329.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [187, 2516.0] + - - [448, 4, 1, 128, 448, 448, 128, 128] + - [178, 70.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 3328] + - [186, 91.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [169, 20.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 1280] + - [184, 82.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 3328] + - [175, 23.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 3328] + - [177, 23.0] + - - [4, 1024, 1, 256, 4, 4, 256, 256] + - [178, 221.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [178, 966.0] + - - [4, 64, 1, 256, 4, 4, 256, 256] + - [178, 14.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [178, 1952.0] + - - [64, 448, 1, 128, 64, 64, 128, 128] + - [178, 1184.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [178, 3053.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [213, 3001.0] + - - [4, 448, 1, 256, 4, 4, 256, 256] + - [210, 97.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 1280] + - [184, 41.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [178, 350.0] + - - [64, 64, 1, 128, 64, 64, 128, 128] + - [212, 178.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 1280] + - [216, 22.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [169, 1378.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [196, 945.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [187, 1374.0] + - - [64, 200, 1, 1024, 64, 64, 1024, 1024] + - [178, 988.0] + - - [32, 512, 1, 1024, 32, 32, 1024, 1024] + - [178, 1265.0] + - - [1, 512, 1, 1024, 1, 1, 1024, 1024] + - [184, 40.0] + - - [128, 512, 1, 2048, 128, 128, 2048, 2048] + - [197, 3352.0] + - - [64, 256, 1, 1024, 64, 64, 1024, 1024] + - [178, 1269.0] + - - [1, 200, 1, 1024, 1, 1, 1024, 1024] + - [184, 16.0] + - - [128, 512, 1, 1024, 128, 128, 1024, 1024] + - [213, 3264.0] + - - [32, 256, 1, 2048, 32, 32, 2048, 2048] + - [186, 703.0] + - - [32, 256, 1, 512, 32, 32, 512, 512] + - [178, 575.0] + - - [256, 200, 1, 1024, 256, 256, 1024, 1024] + - [178, 2962.0] + - - [1, 256, 1, 2048, 1, 1, 2048, 2048] + - [186, 22.0] + - - [32, 200, 1, 2048, 32, 32, 2048, 2048] + - [184, 544.0] + - - [128, 200, 1, 1024, 128, 128, 1024, 1024] + - [178, 1859.0] + - - [128, 256, 1, 2048, 128, 128, 2048, 2048] + - [178, 2465.0] + - - [64, 1024, 1, 1024, 64, 64, 1024, 1024] + - [179, 3169.0] + - - [1, 512, 1, 2048, 1, 1, 2048, 2048] + - [184, 43.0] + - - [128, 256, 1, 512, 128, 128, 512, 512] + - [178, 2113.0] + - - [128, 200, 1, 2048, 128, 128, 2048, 2048] + - [210, 1974.0] + - - [64, 200, 1, 512, 64, 64, 512, 512] + - [178, 888.0] + - - [1, 256, 1, 1024, 1, 1, 1024, 1024] + - [178, 20.0] + - - [1, 1024, 1, 1024, 1, 1, 1024, 1024] + - [178, 78.0] + - - [256, 256, 1, 2048, 256, 256, 2048, 2048] + - [197, 3357.0] + - - [128, 256, 1, 1024, 128, 128, 1024, 1024] + - [178, 2330.0] + - - [1, 256, 1, 4096, 1, 1, 4096, 4096] + - [177, 23.0] + - - [32, 512, 1, 512, 32, 32, 512, 512] + - [178, 1143.0] + - - [64, 200, 1, 2048, 64, 64, 2048, 2048] + - [210, 1049.0] + - - [1, 200, 1, 2048, 1, 1, 2048, 2048] + - [177, 17.0] + - - [1, 512, 1, 4096, 1, 1, 4096, 4096] + - [186, 46.0] + - - [256, 256, 1, 1024, 256, 256, 1024, 1024] + - [213, 3303.0] + - - [64, 256, 1, 2048, 64, 64, 2048, 2048] + - [194, 1345.0] + - - [1, 200, 1, 4096, 1, 1, 4096, 4096] + - [177, 18.0] + - - [32, 256, 1, 1024, 32, 32, 1024, 1024] + - [200, 649.0] + - - [32, 200, 1, 1024, 32, 32, 1024, 1024] + - [200, 510.0] + - - [32, 512, 1, 2048, 32, 32, 2048, 2048] + - [194, 1338.0] + - - [128, 200, 1, 512, 128, 128, 512, 512] + - [210, 1655.0] + - - [64, 1024, 1, 2048, 64, 64, 2048, 2048] + - [179, 3274.0] + - - [1, 1024, 1, 2048, 1, 1, 2048, 2048] + - [178, 83.0] + - - [32, 1024, 1, 512, 32, 32, 512, 512] + - [178, 2031.0] + - - [64, 1024, 1, 512, 64, 64, 512, 512] + - [179, 2903.0] + - - [1, 1024, 1, 4096, 1, 1, 4096, 4096] + - [178, 86.0] + - - [64, 256, 1, 512, 64, 64, 512, 512] + - [178, 1149.0] + - - [256, 200, 1, 512, 256, 256, 512, 512] + - [178, 2748.0] + - - [32, 1024, 1, 1024, 32, 32, 1024, 1024] + - [178, 2228.0] + - - [32, 200, 1, 512, 32, 32, 512, 512] + - [178, 442.0] + - - [256, 256, 1, 512, 256, 256, 512, 512] + - [197, 3028.0] + - - [128, 512, 1, 512, 128, 128, 512, 512] + - [213, 3028.0] + - - [256, 200, 1, 2048, 256, 256, 2048, 2048] + - [178, 3113.0] + - - [64, 512, 1, 2048, 64, 64, 2048, 2048] + - [178, 2446.0] + - - [32, 1024, 1, 2048, 32, 32, 2048, 2048] + - [210, 2363.0] + - - [256, 64, 1, 1225, 256, 256, 1225, 1225] + - [207, 1230.0] + - - [384, 64, 1, 1225, 384, 384, 1225, 1225] + - [175, 1842.0] + - - [288, 64, 1, 1225, 288, 288, 1225, 1225] + - [207, 1382.0] + - - [384, 96, 1, 1225, 384, 384, 1225, 1225] + - [175, 2340.0] + - - [11, 11, 5456, 64, 11, 11, 64, 64] + - [202, 1645.0] + - - [14, 14, 4368, 64, 14, 14, 64, 64] + - [202, 2340.0] + - - [23, 23, 2720, 64, 23, 23, 64, 64] + - [201, 1990.0] + - - [13, 13, 4672, 64, 13, 13, 64, 64] + - [169, 2132.0] + - - [29, 29, 2176, 64, 29, 29, 64, 64] + - [201, 3055.0] + - - [12, 12, 5040, 64, 12, 12, 64, 64] + - [194, 1918.0] + - - [27, 27, 2336, 64, 27, 27, 64, 64] + - [217, 2710.0] + - - [10, 10, 5952, 64, 10, 10, 64, 64] + - [169, 1338.0] + - - [7, 7, 8192, 64, 7, 7, 64, 64] + - [202, 681.0] + - - [16, 16, 3840, 64, 16, 16, 64, 64] + - [175, 2801.0] + - - [17, 17, 3632, 64, 17, 17, 64, 64] + - [179, 1587.0] + - - [9, 9, 6544, 64, 9, 9, 64, 64] + - [210, 1135.0] + - - [8, 8, 7280, 64, 8, 8, 64, 64] + - [178, 911.0] + - - [21, 21, 2976, 64, 21, 21, 64, 64] + - [195, 1979.0] + - - [19, 19, 3264, 64, 19, 19, 64, 64] + - [179, 1968.0] + - - [25, 25, 2512, 64, 25, 25, 64, 64] + - [201, 2324.0] + - - [18, 18, 3440, 64, 18, 18, 64, 64] + - [179, 1761.0] + - - [15, 15, 4096, 64, 15, 15, 64, 64] + - [216, 2462.0] + - - [2, 16, 1, 768, 2, 2, 768, 768] + - [169, 2.0] + - - [2, 8, 1, 768, 2, 2, 768, 768] + - [169, 1.0] + - - [2, 64, 1, 768, 2, 2, 768, 768] + - [187, 10.0] + - - [256, 128, 1, 784, 256, 256, 784, 784] + - [169, 2134.0] + - - [192, 48, 1, 1225, 192, 192, 1225, 1225] + - [175, 693.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [169, 1383.0] + - - [512, 144, 1, 196, 512, 512, 196, 196] + - [187, 2608.0] + - - [400, 32, 1, 784, 400, 400, 784, 784] + - [175, 926.0] + - - [832, 48, 1, 49, 832, 832, 49, 49] + - [169, 843.0] + - - [192, 32, 1, 784, 192, 192, 784, 784] + - [184, 468.0] + - - [288, 48, 1, 1225, 288, 288, 1225, 1225] + - [207, 1039.0] + - - [512, 112, 1, 196, 512, 512, 196, 196] + - [187, 2187.0] + - - [528, 32, 1, 196, 528, 528, 196, 196] + - [200, 792.0] + - - [576, 64, 1, 3136, 576, 576, 3136, 3136] + - [187, 2755.0] + - - [480, 64, 1, 196, 480, 480, 196, 196] + - [194, 1356.0] + - - [192, 64, 1, 784, 192, 192, 784, 784] + - [175, 887.0] + - - [192, 32, 1, 1225, 192, 192, 1225, 1225] + - [177, 484.0] + - - [400, 48, 1, 196, 400, 400, 196, 196] + - [184, 896.0] + - - [480, 16, 1, 196, 480, 480, 196, 196] + - [184, 369.0] + - - [512, 64, 1, 196, 512, 512, 196, 196] + - [200, 1361.0] + - - [800, 64, 1, 196, 800, 800, 196, 196] + - [194, 2048.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [213, 3168.0] + - - [256, 64, 1, 784, 256, 256, 784, 784] + - [192, 1178.0] + - - [256, 48, 1, 1225, 256, 256, 1225, 1225] + - [175, 922.0] + - - [192, 16, 1, 784, 192, 192, 784, 784] + - [184, 233.0] + - - [576, 96, 1, 1225, 576, 576, 1225, 1225] + - [202, 2801.0] + - - [512, 128, 1, 196, 512, 512, 196, 196] + - [187, 2370.0] + - - [192, 96, 1, 784, 192, 192, 784, 784] + - [207, 1321.0] + - - [192, 64, 1, 1225, 192, 192, 1225, 1225] + - [175, 923.0] + - - [512, 32, 1, 196, 512, 512, 196, 196] + - [184, 765.0] + - - [528, 128, 1, 196, 528, 528, 196, 196] + - [173, 2444.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [213, 3168.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [177, 366.0] + - - [256, 32, 1, 784, 256, 256, 784, 784] + - [184, 635.0] + - - [480, 96, 1, 196, 480, 480, 196, 196] + - [178, 1866.0] + - - [1024, 32, 1, 1001, 1024, 1024, 1001, 1001] + - [175, 2040.0] + - - [18, 18, 648, 64, 18, 18, 64, 64] + - [170, 1355.0] + - - [7, 7, 736, 64, 7, 7, 64, 64] + - [178, 460.0] + - - [8, 8, 264, 64, 8, 8, 64, 64] + - [187, 381.0] + - - [9, 9, 416, 64, 9, 9, 64, 64] + - [202, 613.0] + - - [10, 10, 448, 64, 10, 10, 64, 64] + - [202, 739.0] + - - [11, 11, 568, 64, 11, 11, 64, 64] + - [178, 1004.0] + - - [12, 12, 480, 64, 12, 12, 64, 64] + - [202, 1084.0] + - - [12, 12, 2520, 64, 12, 12, 64, 64] + - [178, 1715.0] + - - [13, 13, 576, 64, 13, 13, 64, 64] + - [169, 1378.0] + - - [13, 13, 2336, 64, 13, 13, 64, 64] + - [169, 1908.0] + - - [14, 14, 704, 64, 14, 14, 64, 64] + - [169, 1660.0] + - - [14, 14, 2184, 64, 14, 14, 64, 64] + - [187, 2063.0] + - - [15, 15, 688, 64, 15, 15, 64, 64] + - [210, 1855.0] + - - [15, 15, 2048, 64, 15, 15, 64, 64] + - [175, 2191.0] + - - [16, 16, 712, 64, 16, 16, 64, 64] + - [178, 1925.0] + - - [16, 16, 1920, 64, 16, 16, 64, 64] + - [184, 2416.0] + - - [17, 17, 688, 64, 17, 17, 64, 64] + - [179, 1238.0] + - - [17, 17, 1816, 64, 17, 17, 64, 64] + - [203, 1476.0] + - - [18, 18, 1720, 64, 18, 18, 64, 64] + - [203, 1645.0] + - - [19, 19, 680, 64, 19, 19, 64, 64] + - [179, 1519.0] + - - [19, 19, 1632, 64, 19, 19, 64, 64] + - [195, 1742.0] + - - [21, 21, 1472, 64, 21, 21, 64, 64] + - [188, 1679.0] + - - [21, 21, 1488, 64, 21, 21, 64, 64] + - [179, 1717.0] + - - [23, 23, 64, 64, 23, 23, 64, 64] + - [202, 791.0] + - - [23, 23, 1360, 64, 23, 23, 64, 64] + - [169, 1832.0] + - - [25, 25, 176, 64, 25, 25, 64, 64] + - [179, 1578.0] + - - [25, 25, 1256, 64, 25, 25, 64, 64] + - [210, 2131.0] + - - [26, 26, 56, 64, 26, 26, 64, 64] + - [173, 891.0] + - - [26, 27, 56, 64, 26, 26, 64, 64] + - [173, 918.0] + - - [27, 27, 56, 64, 27, 27, 64, 64] + - [173, 961.0] + - - [27, 27, 1168, 64, 27, 27, 64, 64] + - [217, 2461.0] + - - [29, 29, 136, 64, 29, 29, 64, 64] + - [173, 1785.0] + - - [29, 29, 1088, 64, 29, 29, 64, 64] + - [201, 2848.0] + - - [256, 1, 1, 4, 256, 256, 4, 4] + - [169, 1.0] + - - [2, 1, 1, 1024, 2, 2, 1024, 1024] + - [192, 0.16] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 77.0] + - - [2, 6, 1, 1024, 2, 2, 1024, 1024] + - [169, 1.0] + - - [2, 8, 1, 1024, 2, 2, 1024, 1024] + - [169, 1.0] + - - [14, 14, 1, 64, 14, 14, 64, 64] + - [169, 7.0] + - - [15, 14, 1, 64, 15, 15, 64, 64] + - [169, 8.0] + - - [15, 15, 1, 64, 15, 15, 64, 64] + - [187, 9.0] + - - [17, 15, 1, 64, 17, 17, 64, 64] + - [171, 9.0] + - - [17, 17, 1, 64, 17, 17, 64, 64] + - [211, 9.0] + - - [30, 30, 1, 64, 30, 30, 64, 64] + - [180, 28.0] + - - [30, 31, 1, 64, 30, 30, 64, 64] + - [171, 28.0] + - - [31, 31, 1, 64, 31, 31, 64, 64] + - [180, 29.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2273.0] + - - [2, 32, 1, 1024, 2, 2, 1024, 1024] + - [169, 5.0] + - - [2, 4, 1, 1024, 2, 2, 1024, 1024] + - [169, 1.0] + - - [64, 512, 1, 512, 64, 64, 512, 512] + - [178, 2102.0] + - - [64, 960, 1, 1024, 64, 64, 1024, 1024] + - [178, 3019.0] + - - [200, 1, 1, 1024, 200, 200, 1024, 1024] + - [184, 16.0] + - - [512, 1, 1, 2048, 512, 512, 2048, 2048] + - [184, 45.0] + - - [64, 512, 1, 1024, 64, 64, 1024, 1024] + - [178, 2327.0] + - - [3, 3, 512, 64, 3, 3, 64, 64] + - [169, 76.0] + - - [5, 5, 512, 64, 5, 5, 64, 64] + - [202, 207.0] + - - [9, 9, 512, 64, 9, 9, 64, 64] + - [178, 638.0] + - - [128, 256, 1, 1444, 128, 128, 1444, 1444] + - [175, 2114.0] + - - [256, 128, 1, 25, 256, 256, 25, 25] + - [173, 440.0] + - - [256, 128, 1, 9, 256, 256, 9, 9] + - [173, 184.0] + - - [256, 256, 1, 1444, 256, 256, 1444, 1444] + - [197, 3197.0] + - - [512, 128, 1, 100, 512, 512, 100, 100] + - [202, 1905.0] + - - [64, 128, 1, 1444, 64, 64, 1444, 1444] + - [177, 665.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 3250.0] + - - [2, 10, 1, 1024, 2, 2, 1024, 1024] + - [175, 2.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 763.0] + - - [2, 39, 1, 1024, 2, 2, 1024, 1024] + - [169, 6.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2369.0] + - - [2, 40, 1, 1024, 2, 2, 1024, 1024] + - [169, 6.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2430.0] + - - [2, 41, 1, 1024, 2, 2, 1024, 1024] + - [169, 6.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2491.0] + - - [2, 5, 1, 1024, 2, 2, 1024, 1024] + - [169, 1.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [210, 383.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 612.0] + - - [2, 9, 1, 1024, 2, 2, 1024, 1024] + - [169, 1.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 689.0] + - - [4, 4, 32768, 64, 4, 4, 64, 64] + - [187, 234.0] + - - [4, 4, 38400, 64, 4, 4, 64, 64] + - [194, 234.0] + - - [14, 14, 10880, 64, 14, 14, 64, 64] + - [202, 2564.0] + - - [15, 14, 10880, 64, 15, 15, 64, 64] + - [202, 2724.0] + - - [15, 15, 7680, 64, 15, 15, 64, 64] + - [175, 2619.0] + - - [15, 15, 10880, 64, 15, 15, 64, 64] + - [169, 2706.0] + - - [17, 15, 7680, 64, 17, 17, 64, 64] + - [187, 1839.0] + - - [17, 17, 6144, 64, 17, 17, 64, 64] + - [170, 1635.0] + - - [17, 17, 7680, 64, 17, 17, 64, 64] + - [188, 1692.0] + - - [21, 17, 6144, 64, 21, 21, 64, 64] + - [188, 2018.0] + - - [21, 21, 6144, 64, 21, 21, 64, 64] + - [211, 2191.0] + - - [24, 24, 4736, 64, 24, 24, 64, 64] + - [208, 2237.0] + - - [30, 30, 2048, 64, 30, 30, 64, 64] + - [176, 3306.0] + - - [30, 31, 2048, 64, 30, 30, 64, 64] + - [201, 3388.0] + - - [31, 31, 2048, 64, 31, 31, 64, 64] + - [185, 3483.0] + - - [34, 24, 4736, 64, 34, 34, 64, 64] + - [172, 2597.0] + - - [128, 128, 1, 64, 128, 128, 64, 64] + - [171, 464.0] + - - [2, 1024, 1, 1024, 2, 2, 1024, 1024] + - [178, 156.0] + - - [5, 5, 1, 64, 5, 5, 64, 64] + - [169, 1.0] + - - [33, 33, 1, 32, 33, 33, 32, 32] + - [169, 17.0] + - - [5, 5, 960, 64, 5, 5, 64, 64] + - [169, 263.0] + - - [27, 27, 32768, 128, 27, 27, 128, 128] + - [201, 2352.0] + - - [960, 1, 1, 2048, 960, 960, 2048, 2048] + - [194, 75.0] + - - [2, 2, 1, 2048, 2, 2, 2048, 2048] + - [175, 0.34] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 1235.0] + - - [2, 16, 1, 1024, 2, 2, 1024, 1024] + - [169, 2.0] + - - [2, 4, 1, 2560, 2, 2, 2560, 2560] + - [169, 1.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 3056.0] + - - [2, 64, 1, 1024, 2, 2, 1024, 1024] + - [169, 10.0] + - - [864, 1, 1, 256, 864, 864, 256, 256] + - [178, 46.0] + - - [2, 80, 1, 1024, 2, 2, 1024, 1024] + - [171, 12.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 3407.0] + - - [2, 82, 1, 1024, 2, 2, 1024, 1024] + - [184, 13.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 917.0] + - - [2, 12, 1, 1024, 2, 2, 1024, 1024] + - [169, 2.0] + - - [24, 24, 6816, 64, 24, 24, 64, 64] + - [217, 2267.0] + - - [26, 26, 6272, 64, 26, 26, 64, 64] + - [185, 2658.0] + - - [256, 128, 1, 3136, 256, 256, 3136, 3136] + - [169, 2628.0] + - - [2, 128, 1, 1024, 2, 2, 1024, 1024] + - [169, 19.0] + - - [2, 96, 1, 1024, 2, 2, 1024, 1024] + - [184, 15.0] + - - [768, 12, 1, 768, 768, 768, 768, 768] + - [178, 668.0] + - - [768, 4, 1, 768, 768, 768, 768, 768] + - [194, 223.0] + - - [256, 80, 1, 784, 256, 256, 784, 784] + - [207, 1468.0] + - - [256, 12, 1, 3800, 256, 256, 3800, 3800] + - [177, 274.0] + - - [256, 3, 1, 3800, 256, 256, 3800, 3800] + - [192, 71.0] + - - [256, 12, 1, 950, 256, 256, 950, 950] + - [200, 242.0] + - - [256, 3, 1, 950, 256, 256, 950, 950] + - [192, 61.0] + - - [256, 12, 1, 3220, 256, 256, 3220, 3220] + - [177, 271.0] + - - [256, 3, 1, 3220, 256, 256, 3220, 3220] + - [175, 70.0] + - - [256, 12, 1, 3072, 256, 256, 3072, 3072] + - [200, 274.0] + - - [256, 3, 1, 3072, 256, 256, 3072, 3072] + - [184, 69.0] + - - [256, 12, 1, 850, 256, 256, 850, 850] + - [200, 238.0] + - - [256, 3, 1, 850, 256, 256, 850, 850] + - [175, 60.0] + - - [256, 12, 1, 2852, 256, 256, 2852, 2852] + - [177, 269.0] + - - [256, 3, 1, 2852, 256, 256, 2852, 2852] + - [175, 69.0] + - - [256, 12, 1, 805, 256, 256, 805, 805] + - [184, 234.0] + - - [256, 3, 1, 805, 256, 256, 805, 805] + - [175, 59.0] + - - [256, 3, 1, 864, 256, 256, 864, 864] + - [175, 60.0] + - - [256, 3, 1, 768, 256, 256, 768, 768] + - [184, 58.0] + - - [256, 12, 1, 864, 256, 256, 864, 864] + - [184, 239.0] + - - [256, 12, 1, 768, 256, 256, 768, 768] + - [184, 233.0] + - - [256, 12, 1, 2904, 256, 256, 2904, 2904] + - [177, 269.0] + - - [256, 3, 1, 2904, 256, 256, 2904, 2904] + - [175, 69.0] + - - [256, 3, 1, 713, 256, 256, 713, 713] + - [192, 58.0] + - - [256, 12, 1, 888, 256, 256, 888, 888] + - [200, 240.0] + - - [256, 3, 1, 888, 256, 256, 888, 888] + - [175, 60.0] + - - [256, 12, 1, 713, 256, 256, 713, 713] + - [184, 227.0] + - - [256, 3, 1, 660, 256, 256, 660, 660] + - [175, 57.0] + - - [256, 3, 1, 672, 256, 256, 672, 672] + - [175, 57.0] + - - [256, 12, 1, 660, 256, 256, 660, 660] + - [200, 225.0] + - - [256, 3, 1, 726, 256, 256, 726, 726] + - [192, 58.0] + - - [256, 12, 1, 672, 256, 256, 672, 672] + - [184, 226.0] + - - [256, 3, 1, 247, 256, 256, 247, 247] + - [175, 40.0] + - - [256, 12, 1, 726, 256, 256, 726, 726] + - [200, 230.0] + - - [256, 3, 1, 216, 256, 256, 216, 216] + - [175, 38.0] + - - [256, 3, 1, 3400, 256, 256, 3400, 3400] + - [175, 70.0] + - - [256, 3, 1, 221, 256, 256, 221, 221] + - [175, 38.0] + - - [256, 12, 1, 3552, 256, 256, 3552, 3552] + - [177, 274.0] + - - [256, 3, 1, 3456, 256, 256, 3456, 3456] + - [175, 70.0] + - - [256, 3, 1, 204, 256, 256, 204, 204] + - [175, 36.0] + - - [256, 12, 1, 3400, 256, 256, 3400, 3400] + - [177, 273.0] + - - [256, 12, 1, 3456, 256, 256, 3456, 3456] + - [184, 275.0] + - - [256, 12, 1, 221, 256, 256, 221, 221] + - [200, 152.0] + - - [256, 3, 1, 3552, 256, 256, 3552, 3552] + - [175, 70.0] + - - [256, 3, 1, 228, 256, 256, 228, 228] + - [175, 39.0] + - - [256, 3, 1, 234, 256, 256, 234, 234] + - [175, 39.0] + - - [256, 12, 1, 234, 256, 256, 234, 234] + - [216, 156.0] + - - [81, 1024, 1, 1024, 81, 81, 1024, 1024] + - [211, 3341.0] + - - [81, 1000, 1, 1024, 81, 81, 1024, 1024] + - [211, 3284.0] + - - [256, 12, 1, 228, 256, 256, 228, 228] + - [184, 156.0] + - - [256, 3, 1, 252, 256, 256, 252, 252] + - [175, 41.0] + - - [256, 12, 1, 252, 256, 256, 252, 252] + - [200, 161.0] + - - [256, 12, 1, 247, 256, 256, 247, 247] + - [216, 161.0] + - - [1024, 6, 1, 2, 1024, 1024, 2, 2] + - [169, 8.0] + - - [2, 8, 1, 2048, 2, 2, 2048, 2048] + - [169, 1.0] + - - [2, 20, 1, 1024, 2, 2, 1024, 1024] + - [169, 3.0] + - - [2, 2, 1, 2560, 2, 2, 2560, 2560] + - [175, 0.36] +- null